blob: 193d898f1b2760b91e0bd3d56fe6aa158db7cb52 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Larry Hastings61272b72014-01-07 12:41:53 -080050/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080051class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080052[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080053/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080054
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055/* --- Globals ------------------------------------------------------------
56
Serhiy Storchaka05997252013-01-26 12:14:02 +020057NOTE: In the interpreter's initialization phase, some globals are currently
58 initialized dynamically as needed. In the process Unicode objects may
59 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000060
61*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063
64#ifdef __cplusplus
65extern "C" {
66#endif
67
/* Maximum code point of Unicode 6.0: U+10FFFF (decimal 1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Check applied by the accessor macros below before touching an object:
   the full consistency check (without content scan) in debug builds,
   a plain type check otherwise. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020076
/* Unchecked access to the utf8 field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* UTF-8 buffer of a ready string.  For a compact ASCII object this is the
   memory immediately following the PyASCIIObject header; for every other
   object it is the utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_COMPACT_ASCII(op) \
          ? ((char *)((PyASCIIObject *)(op) + 1)) \
          : _PyUnicode_UTF8(op)))

/* Unchecked access to the utf8_length field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* Length of the UTF-8 buffer of a ready string.  For a compact ASCII object
   this is the length field of the PyASCIIObject header; otherwise it is the
   utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     (PyUnicode_IS_COMPACT_ASCII(op) \
          ? ((PyASCIIObject *)(op))->length \
          : _PyUnicode_UTF8_LENGTH(op)))
/* Direct field accessors.  Each one casts op to the structure that declares
   the field; only _PyUnicode_KIND and _PyUnicode_GET_LENGTH assert that op
   passes _PyUnicode_CHECK first. */
#define _PyUnicode_WSTR(op)         (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)

/* Checked read of state.kind. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)

/* Checked read of the length field. */
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200111
/* File-local replacement for PyUnicode_READY: asserts that op passes
   _PyUnicode_CHECK, then evaluates to 0 when op is already ready and to
   the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True if the utf8 pointer of op is the same address as its character data.
   Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the same address as its character data.
   The macro refers only to its own parameter, so it works regardless of
   what the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* True if the Unicode object owns a separately allocated UTF-8 block:
   it is not compact ASCII, its utf8 pointer is set, and that pointer is
   not the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    ((!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separately allocated wstr block:
   its wstr pointer is set and either the object is not ready or the
   pointer is not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    ((_PyUnicode_WSTR(op) \
      && (!PyUnicode_IS_READY(op) \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
140
/* Generic helper that copies a run of characters while converting the
   element type.

   from_type, to_type: valid type names.
   begin, end:         delimit the source characters ("from_type *").
   to:                 destination buffer ("to_type *").

   Each element is converted with a plain cast.  The bulk of the input is
   processed four elements per iteration; the remainder (fewer than four)
   is handled one element at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (from_type *)(begin);                  \
        const from_type *_end = (from_type *)(end);                     \
        Py_ssize_t _count = (_end) - (_iter);                           \
        const from_type *_unrolled_end =                                \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);                     \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) {         \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        for (; _iter < (_end); _iter++, _to++) {                        \
            *_to = (to_type) *_iter;                                    \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
Walter Dörwald16807132007-05-25 13:52:07 +0000165/* This dictionary holds all interned unicode strings. Note that references
166 to strings in this dictionary are *not* counted in the string's ob_refcnt.
167 When the interned string reaches a refcnt of 0 the string deallocation
168 function will delete the reference from this dictionary.
169
170 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000171 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000172*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200173static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000174
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200176static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200177
Serhiy Storchaka678db842013-01-26 12:16:36 +0200178#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200179 do { \
180 if (unicode_empty != NULL) \
181 Py_INCREF(unicode_empty); \
182 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200183 unicode_empty = PyUnicode_New(0, 0); \
184 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200185 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200186 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
187 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200189 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190
Serhiy Storchaka678db842013-01-26 12:16:36 +0200191#define _Py_RETURN_UNICODE_EMPTY() \
192 do { \
193 _Py_INCREF_UNICODE_EMPTY(); \
194 return unicode_empty; \
195 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200197/* Forward declaration */
198Py_LOCAL_INLINE(int)
199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
200
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point 0..127; an entry is 1 for whitespace, 0 otherwise.
   Each row below covers eight consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100242static int unicode_modifiable(PyObject *unicode);
243
Victor Stinnerfe226c02011-10-03 03:52:20 +0200244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200247static PyObject *
248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
249static PyObject *
250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
251
252static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100255 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
257
Alexander Belopolsky40018472011-02-26 01:02:56 +0000258static void
259raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300260 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100261 PyObject *unicode,
262 Py_ssize_t startpos, Py_ssize_t endpos,
263 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000264
/* Fast detection of line break characters in the ASCII range.
   Indexed by code point 0..127; an entry is 1 for a line break, 0 otherwise.
   The table is read-only, hence const.  Each row below covers eight
   consecutive code points. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
292
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300293/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
294 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000296PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000298#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000299 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000300#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 /* This is actually an illegal character, so it should
302 not be passed to unichr. */
303 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000304#endif
305}
306
#ifdef Py_DEBUG
/* Debug-build sanity check of a Unicode object's internal state.

   op:            object to inspect; must be a Unicode object (asserted).
   check_content: when non-zero (and the string is not in the wchar_t-only
                  representation), also scan the characters to verify that
                  the narrowest suitable kind is used and that the buffer is
                  NUL-terminated.

   Every violation trips an assert().  The function itself always returns 1
   so that it can be used inside assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* non-compact string that only has a wstr buffer */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(ascii->length == 0);
            assert(ascii->hash == -1);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
        }
        else {
            /* non-compact string with a character buffer */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert(compact->utf8 == data);
                assert(compact->utf8_length == ascii->length);
            }
            else {
                assert(compact->utf8 != data);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the same
               width as wchar_t */
#if SIZEOF_WCHAR_T == 2
            int same_width_as_wchar = (kind == PyUnicode_2BYTE_KIND);
#else
            int same_width_as_wchar = (kind == PyUnicode_4BYTE_KIND);
#endif
            if (same_width_as_wchar) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t i;

        for (i = 0; i < ascii->length; i++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (maxchar < ch) {
                maxchar = ch;
            }
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the buffer must be NUL-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
422
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100423static PyObject*
424unicode_result_wchar(PyObject *unicode)
425{
426#ifndef Py_DEBUG
427 Py_ssize_t len;
428
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100429 len = _PyUnicode_WSTR_LENGTH(unicode);
430 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200432 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 }
434
435 if (len == 1) {
436 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100437 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100438 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
439 Py_DECREF(unicode);
440 return latin1_char;
441 }
442 }
443
444 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200445 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100446 return NULL;
447 }
448#else
Victor Stinneraa771272012-10-04 02:32:58 +0200449 assert(Py_REFCNT(unicode) == 1);
450
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100451 /* don't make the result ready in debug mode to ensure that the caller
452 makes the string ready before using it */
453 assert(_PyUnicode_CheckConsistency(unicode, 1));
454#endif
455 return unicode;
456}
457
458static PyObject*
459unicode_result_ready(PyObject *unicode)
460{
461 Py_ssize_t length;
462
463 length = PyUnicode_GET_LENGTH(unicode);
464 if (length == 0) {
465 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100466 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200467 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 }
469 return unicode_empty;
470 }
471
472 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200473 void *data = PyUnicode_DATA(unicode);
474 int kind = PyUnicode_KIND(unicode);
475 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100476 if (ch < 256) {
477 PyObject *latin1_char = unicode_latin1[ch];
478 if (latin1_char != NULL) {
479 if (unicode != latin1_char) {
480 Py_INCREF(latin1_char);
481 Py_DECREF(unicode);
482 }
483 return latin1_char;
484 }
485 else {
486 assert(_PyUnicode_CheckConsistency(unicode, 1));
487 Py_INCREF(unicode);
488 unicode_latin1[ch] = unicode;
489 return unicode;
490 }
491 }
492 }
493
494 assert(_PyUnicode_CheckConsistency(unicode, 1));
495 return unicode;
496}
497
498static PyObject*
499unicode_result(PyObject *unicode)
500{
501 assert(_PyUnicode_CHECK(unicode));
502 if (PyUnicode_IS_READY(unicode))
503 return unicode_result_ready(unicode);
504 else
505 return unicode_result_wchar(unicode);
506}
507
Victor Stinnerc4b49542011-12-11 22:44:26 +0100508static PyObject*
509unicode_result_unchanged(PyObject *unicode)
510{
511 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500512 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100513 return NULL;
514 Py_INCREF(unicode);
515 return unicode;
516 }
517 else
518 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100519 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100520}
521
#ifdef HAVE_MBCS
/* Windows version information; only present in builds with HAVE_MBCS. */
static OSVERSIONINFOEX winver;
#endif
525
Thomas Wouters477c8d52006-05-27 19:21:47 +0000526/* --- Bloom Filters ----------------------------------------------------- */
527
528/* stuff to implement simple "bloom filters" for Unicode characters.
529 to keep things simple, we use a single bitmask, using the least 5
530 bits from each unicode characters as the bit index. */
531
532/* the linebreak mask is set up by Unicode_Init below */
533
Antoine Pitrouf068f942010-01-13 14:19:12 +0000534#if LONG_BIT >= 128
535#define BLOOM_WIDTH 128
536#elif LONG_BIT >= 64
537#define BLOOM_WIDTH 64
538#elif LONG_BIT >= 32
539#define BLOOM_WIDTH 32
540#else
541#error "LONG_BIT is smaller than 32"
542#endif
543
Thomas Wouters477c8d52006-05-27 19:21:47 +0000544#define BLOOM_MASK unsigned long
545
Serhiy Storchaka05997252013-01-26 12:14:02 +0200546static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547
/* Non-zero if the bit selected by ch is set in mask.  Both arguments are
   parenthesized so that arbitrary expressions can be passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000549
/* Line break test: characters below 128 are looked up in ascii_linebreak[];
   other characters go through the bloom_linebreak mask first and only call
   Py_UNICODE_ISLINEBREAK() when their bit is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U                                                        \
         ? ascii_linebreak[(ch)]                                        \
         : (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000553
Alexander Belopolsky40018472011-02-26 01:02:56 +0000554Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000556{
Victor Stinnera85af502013-04-09 21:53:54 +0200557#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
558 do { \
559 TYPE *data = (TYPE *)PTR; \
560 TYPE *end = data + LEN; \
561 Py_UCS4 ch; \
562 for (; data != end; data++) { \
563 ch = *data; \
564 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
565 } \
566 break; \
567 } while (0)
568
Thomas Wouters477c8d52006-05-27 19:21:47 +0000569 /* calculate simple bloom-style bitmask for a given unicode string */
570
Antoine Pitrouf068f942010-01-13 14:19:12 +0000571 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000572
573 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200574 switch (kind) {
575 case PyUnicode_1BYTE_KIND:
576 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
577 break;
578 case PyUnicode_2BYTE_KIND:
579 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
580 break;
581 case PyUnicode_4BYTE_KIND:
582 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
583 break;
584 default:
585 assert(0);
586 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000587 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200588
589#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000590}
591
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200592/* Compilation of templated routines */
593
594#include "stringlib/asciilib.h"
595#include "stringlib/fastsearch.h"
596#include "stringlib/partition.h"
597#include "stringlib/split.h"
598#include "stringlib/count.h"
599#include "stringlib/find.h"
600#include "stringlib/find_max_char.h"
601#include "stringlib/localeutil.h"
602#include "stringlib/undef.h"
603
604#include "stringlib/ucs1lib.h"
605#include "stringlib/fastsearch.h"
606#include "stringlib/partition.h"
607#include "stringlib/split.h"
608#include "stringlib/count.h"
609#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300610#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200611#include "stringlib/find_max_char.h"
612#include "stringlib/localeutil.h"
613#include "stringlib/undef.h"
614
615#include "stringlib/ucs2lib.h"
616#include "stringlib/fastsearch.h"
617#include "stringlib/partition.h"
618#include "stringlib/split.h"
619#include "stringlib/count.h"
620#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300621#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200622#include "stringlib/find_max_char.h"
623#include "stringlib/localeutil.h"
624#include "stringlib/undef.h"
625
626#include "stringlib/ucs4lib.h"
627#include "stringlib/fastsearch.h"
628#include "stringlib/partition.h"
629#include "stringlib/split.h"
630#include "stringlib/count.h"
631#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300632#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200633#include "stringlib/find_max_char.h"
634#include "stringlib/localeutil.h"
635#include "stringlib/undef.h"
636
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200637#include "stringlib/unicodedefs.h"
638#include "stringlib/fastsearch.h"
639#include "stringlib/count.h"
640#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100641#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643/* --- Unicode Object ----------------------------------------------------- */
644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200646fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200648Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
649 Py_ssize_t size, Py_UCS4 ch,
650 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200651{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200652 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
653
654 switch (kind) {
655 case PyUnicode_1BYTE_KIND:
656 {
657 Py_UCS1 ch1 = (Py_UCS1) ch;
658 if (ch1 == ch)
659 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
660 else
661 return -1;
662 }
663 case PyUnicode_2BYTE_KIND:
664 {
665 Py_UCS2 ch2 = (Py_UCS2) ch;
666 if (ch2 == ch)
667 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
668 else
669 return -1;
670 }
671 case PyUnicode_4BYTE_KIND:
672 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
673 default:
674 assert(0);
675 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200677}
678
#ifdef Py_DEBUG
/* Debug aid: overwrite the characters in [old_length, current length) with
   0xff bytes so that code reading them before they are initialized is
   noticed earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0.

   Nothing is written when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_width = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* Byte offsets: the kind value doubles as the character width. */
        memset(base + old_length * char_width, 0xff,
               (new_length - old_length) * char_width);
    }
}
#endif
697
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698static PyObject*
699resize_compact(PyObject *unicode, Py_ssize_t length)
700{
701 Py_ssize_t char_size;
702 Py_ssize_t struct_size;
703 Py_ssize_t new_size;
704 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100705 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200706#ifdef Py_DEBUG
707 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
708#endif
709
Victor Stinner79891572012-05-03 13:43:07 +0200710 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200711 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100712 assert(PyUnicode_IS_COMPACT(unicode));
713
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200714 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100715 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 struct_size = sizeof(PyASCIIObject);
717 else
718 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200719 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200720
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
722 PyErr_NoMemory();
723 return NULL;
724 }
725 new_size = (struct_size + (length + 1) * char_size);
726
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200727 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
728 PyObject_DEL(_PyUnicode_UTF8(unicode));
729 _PyUnicode_UTF8(unicode) = NULL;
730 _PyUnicode_UTF8_LENGTH(unicode) = 0;
731 }
Victor Stinner84def372011-12-11 20:04:56 +0100732 _Py_DEC_REFTOTAL;
733 _Py_ForgetReference(unicode);
734
735 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
736 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100737 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200738 PyErr_NoMemory();
739 return NULL;
740 }
Victor Stinner84def372011-12-11 20:04:56 +0100741 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200742 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100743
Victor Stinnerfe226c02011-10-03 03:52:20 +0200744 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200745 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200746 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100747 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200748 _PyUnicode_WSTR_LENGTH(unicode) = length;
749 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100750 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
751 PyObject_DEL(_PyUnicode_WSTR(unicode));
752 _PyUnicode_WSTR(unicode) = NULL;
753 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200754#ifdef Py_DEBUG
755 unicode_fill_invalid(unicode, old_length);
756#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200757 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
758 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200759 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760 return unicode;
761}
762
Alexander Belopolsky40018472011-02-26 01:02:56 +0000763static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200764resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000765{
Victor Stinner95663112011-10-04 01:03:50 +0200766 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100767 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200768 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000770
Victor Stinnerfe226c02011-10-03 03:52:20 +0200771 if (PyUnicode_IS_READY(unicode)) {
772 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200773 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200774 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200775#ifdef Py_DEBUG
776 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
777#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200778
779 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200780 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200781 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
782 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200783
784 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
785 PyErr_NoMemory();
786 return -1;
787 }
788 new_size = (length + 1) * char_size;
789
Victor Stinner7a9105a2011-12-12 00:13:42 +0100790 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
791 {
792 PyObject_DEL(_PyUnicode_UTF8(unicode));
793 _PyUnicode_UTF8(unicode) = NULL;
794 _PyUnicode_UTF8_LENGTH(unicode) = 0;
795 }
796
Victor Stinnerfe226c02011-10-03 03:52:20 +0200797 data = (PyObject *)PyObject_REALLOC(data, new_size);
798 if (data == NULL) {
799 PyErr_NoMemory();
800 return -1;
801 }
802 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200803 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200804 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200805 _PyUnicode_WSTR_LENGTH(unicode) = length;
806 }
807 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200808 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200809 _PyUnicode_UTF8_LENGTH(unicode) = length;
810 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200811 _PyUnicode_LENGTH(unicode) = length;
812 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200813#ifdef Py_DEBUG
814 unicode_fill_invalid(unicode, old_length);
815#endif
Victor Stinner95663112011-10-04 01:03:50 +0200816 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200817 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200818 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200819 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200820 }
Victor Stinner95663112011-10-04 01:03:50 +0200821 assert(_PyUnicode_WSTR(unicode) != NULL);
822
823 /* check for integer overflow */
824 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
825 PyErr_NoMemory();
826 return -1;
827 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100828 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200829 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100830 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200831 if (!wstr) {
832 PyErr_NoMemory();
833 return -1;
834 }
835 _PyUnicode_WSTR(unicode) = wstr;
836 _PyUnicode_WSTR(unicode)[length] = 0;
837 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200838 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000839 return 0;
840}
841
Victor Stinnerfe226c02011-10-03 03:52:20 +0200842static PyObject*
843resize_copy(PyObject *unicode, Py_ssize_t length)
844{
845 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100846 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200847 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100848
Benjamin Petersonbac79492012-01-14 13:34:47 -0500849 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100850 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200851
852 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
853 if (copy == NULL)
854 return NULL;
855
856 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200857 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200858 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200859 }
860 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200861 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100862
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200863 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200864 if (w == NULL)
865 return NULL;
866 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
867 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200868 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
869 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200870 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200871 }
872}
873
Guido van Rossumd57fd912000-03-10 22:53:23 +0000874/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000875 Ux0000 terminated; some code (e.g. new_identifier)
876 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000877
878 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000879 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880
881*/
882
Alexander Belopolsky40018472011-02-26 01:02:56 +0000883static PyUnicodeObject *
884_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200886 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200887 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000888
Thomas Wouters477c8d52006-05-27 19:21:47 +0000889 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890 if (length == 0 && unicode_empty != NULL) {
891 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200892 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893 }
894
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000895 /* Ensure we won't overflow the size. */
896 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
897 return (PyUnicodeObject *)PyErr_NoMemory();
898 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200899 if (length < 0) {
900 PyErr_SetString(PyExc_SystemError,
901 "Negative size passed to _PyUnicode_New");
902 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000903 }
904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200905 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
906 if (unicode == NULL)
907 return NULL;
908 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100909
910 _PyUnicode_WSTR_LENGTH(unicode) = length;
911 _PyUnicode_HASH(unicode) = -1;
912 _PyUnicode_STATE(unicode).interned = 0;
913 _PyUnicode_STATE(unicode).kind = 0;
914 _PyUnicode_STATE(unicode).compact = 0;
915 _PyUnicode_STATE(unicode).ready = 0;
916 _PyUnicode_STATE(unicode).ascii = 0;
917 _PyUnicode_DATA_ANY(unicode) = NULL;
918 _PyUnicode_LENGTH(unicode) = 0;
919 _PyUnicode_UTF8(unicode) = NULL;
920 _PyUnicode_UTF8_LENGTH(unicode) = 0;
921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200922 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
923 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100924 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000925 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100926 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000927 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200928
Jeremy Hyltond8082792003-09-16 19:41:39 +0000929 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000930 * the caller fails before initializing str -- unicode_resize()
931 * reads str[0], and the Keep-Alive optimization can keep memory
932 * allocated for str alive across a call to unicode_dealloc(unicode).
933 * We don't want unicode_resize to read uninitialized memory in
934 * that case.
935 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200936 _PyUnicode_WSTR(unicode)[0] = 0;
937 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100938
Victor Stinner7931d9a2011-11-04 00:22:48 +0100939 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000940 return unicode;
941}
942
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943static const char*
944unicode_kind_name(PyObject *unicode)
945{
Victor Stinner42dfd712011-10-03 14:41:45 +0200946 /* don't check consistency: unicode_kind_name() is called from
947 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200948 if (!PyUnicode_IS_COMPACT(unicode))
949 {
950 if (!PyUnicode_IS_READY(unicode))
951 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600952 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200953 {
954 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200955 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200956 return "legacy ascii";
957 else
958 return "legacy latin1";
959 case PyUnicode_2BYTE_KIND:
960 return "legacy UCS2";
961 case PyUnicode_4BYTE_KIND:
962 return "legacy UCS4";
963 default:
964 return "<legacy invalid kind>";
965 }
966 }
967 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600968 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200970 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200971 return "ascii";
972 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200973 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200974 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200975 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200976 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200977 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200978 default:
979 return "<invalid compact kind>";
980 }
981}
982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
988
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
992void *_PyUnicode_data(void *unicode){
993 printf("obj %p\n", unicode);
994 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
995 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
996 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
997 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
998 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
999 return PyUnicode_DATA(unicode);
1000}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001001
1002void
1003_PyUnicode_Dump(PyObject *op)
1004{
1005 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001006 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1007 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1008 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001009
Victor Stinnera849a4b2011-10-03 12:12:11 +02001010 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001011 {
1012 if (ascii->state.ascii)
1013 data = (ascii + 1);
1014 else
1015 data = (compact + 1);
1016 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001017 else
1018 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001019 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1020 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001021
Victor Stinnera849a4b2011-10-03 12:12:11 +02001022 if (ascii->wstr == data)
1023 printf("shared ");
1024 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001025
Victor Stinnera3b334d2011-10-03 13:53:37 +02001026 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001027 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001028 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1029 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001030 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1031 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001032 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001033 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001034}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001035#endif
1036
1037PyObject *
1038PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1039{
1040 PyObject *obj;
1041 PyCompactUnicodeObject *unicode;
1042 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001043 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001044 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 Py_ssize_t char_size;
1046 Py_ssize_t struct_size;
1047
1048 /* Optimization for empty strings */
1049 if (size == 0 && unicode_empty != NULL) {
1050 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001051 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 }
1053
Victor Stinner9e9d6892011-10-04 01:02:02 +02001054 is_ascii = 0;
1055 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 struct_size = sizeof(PyCompactUnicodeObject);
1057 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001058 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001059 char_size = 1;
1060 is_ascii = 1;
1061 struct_size = sizeof(PyASCIIObject);
1062 }
1063 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001064 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 char_size = 1;
1066 }
1067 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001068 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 char_size = 2;
1070 if (sizeof(wchar_t) == 2)
1071 is_sharing = 1;
1072 }
1073 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001074 if (maxchar > MAX_UNICODE) {
1075 PyErr_SetString(PyExc_SystemError,
1076 "invalid maximum character passed to PyUnicode_New");
1077 return NULL;
1078 }
Victor Stinner8f825062012-04-27 13:55:39 +02001079 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001080 char_size = 4;
1081 if (sizeof(wchar_t) == 4)
1082 is_sharing = 1;
1083 }
1084
1085 /* Ensure we won't overflow the size. */
1086 if (size < 0) {
1087 PyErr_SetString(PyExc_SystemError,
1088 "Negative size passed to PyUnicode_New");
1089 return NULL;
1090 }
1091 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1092 return PyErr_NoMemory();
1093
1094 /* Duplicated allocation code from _PyObject_New() instead of a call to
1095 * PyObject_New() so we are able to allocate space for the object and
1096 * it's data buffer.
1097 */
1098 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1099 if (obj == NULL)
1100 return PyErr_NoMemory();
1101 obj = PyObject_INIT(obj, &PyUnicode_Type);
1102 if (obj == NULL)
1103 return NULL;
1104
1105 unicode = (PyCompactUnicodeObject *)obj;
1106 if (is_ascii)
1107 data = ((PyASCIIObject*)obj) + 1;
1108 else
1109 data = unicode + 1;
1110 _PyUnicode_LENGTH(unicode) = size;
1111 _PyUnicode_HASH(unicode) = -1;
1112 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001113 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001114 _PyUnicode_STATE(unicode).compact = 1;
1115 _PyUnicode_STATE(unicode).ready = 1;
1116 _PyUnicode_STATE(unicode).ascii = is_ascii;
1117 if (is_ascii) {
1118 ((char*)data)[size] = 0;
1119 _PyUnicode_WSTR(unicode) = NULL;
1120 }
Victor Stinner8f825062012-04-27 13:55:39 +02001121 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001122 ((char*)data)[size] = 0;
1123 _PyUnicode_WSTR(unicode) = NULL;
1124 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001126 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001127 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128 else {
1129 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001130 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001131 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001132 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001133 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 ((Py_UCS4*)data)[size] = 0;
1135 if (is_sharing) {
1136 _PyUnicode_WSTR_LENGTH(unicode) = size;
1137 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1138 }
1139 else {
1140 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1141 _PyUnicode_WSTR(unicode) = NULL;
1142 }
1143 }
Victor Stinner8f825062012-04-27 13:55:39 +02001144#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001145 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001146#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001147 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148 return obj;
1149}
1150
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing the result
   into the 4-byte data buffer of `unicode`.  A high surrogate immediately
   followed by a low surrogate is joined into one code point; every other
   unit, including a lone surrogate, is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Valid pair: two input units produce one output code point. */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1190
Victor Stinnercd9950f2011-10-02 00:34:53 +02001191static int
Victor Stinner488fa492011-12-12 00:01:39 +01001192unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001193{
Victor Stinner488fa492011-12-12 00:01:39 +01001194 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001195 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001196 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001197 return -1;
1198 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001199 return 0;
1200}
1201
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001202static int
1203_copy_characters(PyObject *to, Py_ssize_t to_start,
1204 PyObject *from, Py_ssize_t from_start,
1205 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001206{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001207 unsigned int from_kind, to_kind;
1208 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209
Victor Stinneree4544c2012-05-09 22:24:08 +02001210 assert(0 <= how_many);
1211 assert(0 <= from_start);
1212 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001213 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001214 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001215 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216
Victor Stinnerd3f08822012-05-29 12:57:52 +02001217 assert(PyUnicode_Check(to));
1218 assert(PyUnicode_IS_READY(to));
1219 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1220
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001221 if (how_many == 0)
1222 return 0;
1223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001225 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001226 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001227 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228
Victor Stinnerf1852262012-06-16 16:38:26 +02001229#ifdef Py_DEBUG
1230 if (!check_maxchar
1231 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1232 {
1233 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1234 Py_UCS4 ch;
1235 Py_ssize_t i;
1236 for (i=0; i < how_many; i++) {
1237 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1238 assert(ch <= to_maxchar);
1239 }
1240 }
1241#endif
1242
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001243 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001244 if (check_maxchar
1245 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1246 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001247 /* Writing Latin-1 characters into an ASCII string requires to
1248 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001249 Py_UCS4 max_char;
1250 max_char = ucs1lib_find_max_char(from_data,
1251 (Py_UCS1*)from_data + how_many);
1252 if (max_char >= 128)
1253 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001254 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001255 Py_MEMCPY((char*)to_data + to_kind * to_start,
1256 (char*)from_data + from_kind * from_start,
1257 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001259 else if (from_kind == PyUnicode_1BYTE_KIND
1260 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001261 {
1262 _PyUnicode_CONVERT_BYTES(
1263 Py_UCS1, Py_UCS2,
1264 PyUnicode_1BYTE_DATA(from) + from_start,
1265 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1266 PyUnicode_2BYTE_DATA(to) + to_start
1267 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001268 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001269 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001270 && to_kind == PyUnicode_4BYTE_KIND)
1271 {
1272 _PyUnicode_CONVERT_BYTES(
1273 Py_UCS1, Py_UCS4,
1274 PyUnicode_1BYTE_DATA(from) + from_start,
1275 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1276 PyUnicode_4BYTE_DATA(to) + to_start
1277 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001278 }
1279 else if (from_kind == PyUnicode_2BYTE_KIND
1280 && to_kind == PyUnicode_4BYTE_KIND)
1281 {
1282 _PyUnicode_CONVERT_BYTES(
1283 Py_UCS2, Py_UCS4,
1284 PyUnicode_2BYTE_DATA(from) + from_start,
1285 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1286 PyUnicode_4BYTE_DATA(to) + to_start
1287 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001288 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001289 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001290 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1291
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001292 if (!check_maxchar) {
1293 if (from_kind == PyUnicode_2BYTE_KIND
1294 && to_kind == PyUnicode_1BYTE_KIND)
1295 {
1296 _PyUnicode_CONVERT_BYTES(
1297 Py_UCS2, Py_UCS1,
1298 PyUnicode_2BYTE_DATA(from) + from_start,
1299 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1300 PyUnicode_1BYTE_DATA(to) + to_start
1301 );
1302 }
1303 else if (from_kind == PyUnicode_4BYTE_KIND
1304 && to_kind == PyUnicode_1BYTE_KIND)
1305 {
1306 _PyUnicode_CONVERT_BYTES(
1307 Py_UCS4, Py_UCS1,
1308 PyUnicode_4BYTE_DATA(from) + from_start,
1309 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1310 PyUnicode_1BYTE_DATA(to) + to_start
1311 );
1312 }
1313 else if (from_kind == PyUnicode_4BYTE_KIND
1314 && to_kind == PyUnicode_2BYTE_KIND)
1315 {
1316 _PyUnicode_CONVERT_BYTES(
1317 Py_UCS4, Py_UCS2,
1318 PyUnicode_4BYTE_DATA(from) + from_start,
1319 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1320 PyUnicode_2BYTE_DATA(to) + to_start
1321 );
1322 }
1323 else {
1324 assert(0);
1325 return -1;
1326 }
1327 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001328 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001329 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001330 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001331 Py_ssize_t i;
1332
Victor Stinnera0702ab2011-09-29 14:14:38 +02001333 for (i=0; i < how_many; i++) {
1334 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001335 if (ch > to_maxchar)
1336 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001337 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1338 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001339 }
1340 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001341 return 0;
1342}
1343
Victor Stinnerd3f08822012-05-29 12:57:52 +02001344void
1345_PyUnicode_FastCopyCharacters(
1346 PyObject *to, Py_ssize_t to_start,
1347 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001348{
1349 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1350}
1351
1352Py_ssize_t
1353PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1354 PyObject *from, Py_ssize_t from_start,
1355 Py_ssize_t how_many)
1356{
1357 int err;
1358
1359 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1360 PyErr_BadInternalCall();
1361 return -1;
1362 }
1363
Benjamin Petersonbac79492012-01-14 13:34:47 -05001364 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001365 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001366 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001367 return -1;
1368
Victor Stinnerd3f08822012-05-29 12:57:52 +02001369 if (from_start < 0) {
1370 PyErr_SetString(PyExc_IndexError, "string index out of range");
1371 return -1;
1372 }
1373 if (to_start < 0) {
1374 PyErr_SetString(PyExc_IndexError, "string index out of range");
1375 return -1;
1376 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001377 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1378 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1379 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001380 "Cannot write %zi characters at %zi "
1381 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001382 how_many, to_start, PyUnicode_GET_LENGTH(to));
1383 return -1;
1384 }
1385
1386 if (how_many == 0)
1387 return 0;
1388
Victor Stinner488fa492011-12-12 00:01:39 +01001389 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001390 return -1;
1391
1392 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1393 if (err) {
1394 PyErr_Format(PyExc_SystemError,
1395 "Cannot copy %s characters "
1396 "into a string of %s characters",
1397 unicode_kind_name(from),
1398 unicode_kind_name(to));
1399 return -1;
1400 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001401 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001402}
1403
Victor Stinner17222162011-09-28 22:15:37 +02001404/* Find the maximum code point and count the number of surrogate pairs so a
1405 correct string length can be computed before converting a string to UCS4.
1406 This function counts single surrogates as a character and not as a pair.
1407
1408 Return 0 on success, or -1 on error. */
1409static int
1410find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1411 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412{
1413 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001414 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415
Victor Stinnerc53be962011-10-02 21:33:54 +02001416 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417 *num_surrogates = 0;
1418 *maxchar = 0;
1419
1420 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001422 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1423 && (iter+1) < end
1424 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1425 {
1426 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1427 ++(*num_surrogates);
1428 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001429 }
1430 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001431#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001432 {
1433 ch = *iter;
1434 iter++;
1435 }
1436 if (ch > *maxchar) {
1437 *maxchar = ch;
1438 if (*maxchar > MAX_UNICODE) {
1439 PyErr_Format(PyExc_ValueError,
1440 "character U+%x is not in range [U+0000; U+10ffff]",
1441 ch);
1442 return -1;
1443 }
1444 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001445 }
1446 return 0;
1447}
1448
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001449int
1450_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001451{
1452 wchar_t *end;
1453 Py_UCS4 maxchar = 0;
1454 Py_ssize_t num_surrogates;
1455#if SIZEOF_WCHAR_T == 2
1456 Py_ssize_t length_wo_surrogates;
1457#endif
1458
Georg Brandl7597add2011-10-05 16:36:47 +02001459 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001460 strings were created using _PyObject_New() and where no canonical
1461 representation (the str field) has been set yet aka strings
1462 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001463 assert(_PyUnicode_CHECK(unicode));
1464 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001466 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001467 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001468 /* Actually, it should neither be interned nor be anything else: */
1469 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001472 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001473 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001475
1476 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001477 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1478 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001479 PyErr_NoMemory();
1480 return -1;
1481 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001482 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001483 _PyUnicode_WSTR(unicode), end,
1484 PyUnicode_1BYTE_DATA(unicode));
1485 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1486 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1487 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1488 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001489 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001490 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001491 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 }
1493 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001494 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001495 _PyUnicode_UTF8(unicode) = NULL;
1496 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497 }
1498 PyObject_FREE(_PyUnicode_WSTR(unicode));
1499 _PyUnicode_WSTR(unicode) = NULL;
1500 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1501 }
1502 /* In this case we might have to convert down from 4-byte native
1503 wchar_t to 2-byte unicode. */
1504 else if (maxchar < 65536) {
1505 assert(num_surrogates == 0 &&
1506 "FindMaxCharAndNumSurrogatePairs() messed up");
1507
Victor Stinner506f5922011-09-28 22:34:18 +02001508#if SIZEOF_WCHAR_T == 2
1509 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001510 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001511 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1512 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1513 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001514 _PyUnicode_UTF8(unicode) = NULL;
1515 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001516#else
1517 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001518 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001519 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001520 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001521 PyErr_NoMemory();
1522 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001523 }
Victor Stinner506f5922011-09-28 22:34:18 +02001524 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1525 _PyUnicode_WSTR(unicode), end,
1526 PyUnicode_2BYTE_DATA(unicode));
1527 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1528 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1529 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001530 _PyUnicode_UTF8(unicode) = NULL;
1531 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001532 PyObject_FREE(_PyUnicode_WSTR(unicode));
1533 _PyUnicode_WSTR(unicode) = NULL;
1534 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1535#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001536 }
1537 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1538 else {
1539#if SIZEOF_WCHAR_T == 2
1540 /* in case the native representation is 2-bytes, we need to allocate a
1541 new normalized 4-byte version. */
1542 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001543 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1544 PyErr_NoMemory();
1545 return -1;
1546 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001547 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1548 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001549 PyErr_NoMemory();
1550 return -1;
1551 }
1552 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1553 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001554 _PyUnicode_UTF8(unicode) = NULL;
1555 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001556 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1557 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001558 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559 PyObject_FREE(_PyUnicode_WSTR(unicode));
1560 _PyUnicode_WSTR(unicode) = NULL;
1561 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1562#else
1563 assert(num_surrogates == 0);
1564
Victor Stinnerc3c74152011-10-02 20:39:55 +02001565 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001567 _PyUnicode_UTF8(unicode) = NULL;
1568 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1570#endif
1571 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1572 }
1573 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001574 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001575 return 0;
1576}
1577
Alexander Belopolsky40018472011-02-26 01:02:56 +00001578static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001579unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001580{
Walter Dörwald16807132007-05-25 13:52:07 +00001581 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 case SSTATE_NOT_INTERNED:
1583 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001584
Benjamin Peterson29060642009-01-31 22:14:21 +00001585 case SSTATE_INTERNED_MORTAL:
1586 /* revive dead object temporarily for DelItem */
1587 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001588 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001589 Py_FatalError(
1590 "deletion of interned string failed");
1591 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001592
Benjamin Peterson29060642009-01-31 22:14:21 +00001593 case SSTATE_INTERNED_IMMORTAL:
1594 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001595
Benjamin Peterson29060642009-01-31 22:14:21 +00001596 default:
1597 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001598 }
1599
Victor Stinner03490912011-10-03 23:45:12 +02001600 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001601 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001602 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001603 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001604 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1605 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001606
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001607 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608}
1609
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string or
   the cached one-character string stored in unicode_latin1[] for its
   character, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a canonical representation can be
       a latin1 cache entry. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1626
Alexander Belopolsky40018472011-02-26 01:02:56 +00001627static int
Victor Stinner488fa492011-12-12 00:01:39 +01001628unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001629{
Victor Stinner488fa492011-12-12 00:01:39 +01001630 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001631 if (Py_REFCNT(unicode) != 1)
1632 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001633 if (_PyUnicode_HASH(unicode) != -1)
1634 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001635 if (PyUnicode_CHECK_INTERNED(unicode))
1636 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001637 if (!PyUnicode_CheckExact(unicode))
1638 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001639#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001640 /* singleton refcount is greater than 1 */
1641 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001642#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001643 return 1;
1644}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001645
Victor Stinnerfe226c02011-10-03 03:52:20 +02001646static int
1647unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1648{
1649 PyObject *unicode;
1650 Py_ssize_t old_length;
1651
1652 assert(p_unicode != NULL);
1653 unicode = *p_unicode;
1654
1655 assert(unicode != NULL);
1656 assert(PyUnicode_Check(unicode));
1657 assert(0 <= length);
1658
Victor Stinner910337b2011-10-03 03:20:16 +02001659 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001660 old_length = PyUnicode_WSTR_LENGTH(unicode);
1661 else
1662 old_length = PyUnicode_GET_LENGTH(unicode);
1663 if (old_length == length)
1664 return 0;
1665
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001666 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001667 _Py_INCREF_UNICODE_EMPTY();
1668 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001669 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001670 Py_DECREF(*p_unicode);
1671 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001672 return 0;
1673 }
1674
Victor Stinner488fa492011-12-12 00:01:39 +01001675 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001676 PyObject *copy = resize_copy(unicode, length);
1677 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001678 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001679 Py_DECREF(*p_unicode);
1680 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001681 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001682 }
1683
Victor Stinnerfe226c02011-10-03 03:52:20 +02001684 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001685 PyObject *new_unicode = resize_compact(unicode, length);
1686 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001687 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001688 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001689 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001690 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001691 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001692}
1693
Alexander Belopolsky40018472011-02-26 01:02:56 +00001694int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001695PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001696{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001697 PyObject *unicode;
1698 if (p_unicode == NULL) {
1699 PyErr_BadInternalCall();
1700 return -1;
1701 }
1702 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001703 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001704 {
1705 PyErr_BadInternalCall();
1706 return -1;
1707 }
1708 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001709}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001710
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001711/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001712
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001713 WARNING: The function doesn't copy the terminating null character and
1714 doesn't check the maximum character (may write a latin1 character in an
1715 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001716static void
1717unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1718 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001719{
1720 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1721 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001722 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001723
1724 switch (kind) {
1725 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001726 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001727#ifdef Py_DEBUG
1728 if (PyUnicode_IS_ASCII(unicode)) {
1729 Py_UCS4 maxchar = ucs1lib_find_max_char(
1730 (const Py_UCS1*)str,
1731 (const Py_UCS1*)str + len);
1732 assert(maxchar < 128);
1733 }
1734#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001735 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001736 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001737 }
1738 case PyUnicode_2BYTE_KIND: {
1739 Py_UCS2 *start = (Py_UCS2 *)data + index;
1740 Py_UCS2 *ucs2 = start;
1741 assert(index <= PyUnicode_GET_LENGTH(unicode));
1742
Victor Stinner184252a2012-06-16 02:57:41 +02001743 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001744 *ucs2 = (Py_UCS2)*str;
1745
1746 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001747 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001748 }
1749 default: {
1750 Py_UCS4 *start = (Py_UCS4 *)data + index;
1751 Py_UCS4 *ucs4 = start;
1752 assert(kind == PyUnicode_4BYTE_KIND);
1753 assert(index <= PyUnicode_GET_LENGTH(unicode));
1754
Victor Stinner184252a2012-06-16 02:57:41 +02001755 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001756 *ucs4 = (Py_UCS4)*str;
1757
1758 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001759 }
1760 }
1761}
1762
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763static PyObject*
1764get_latin1_char(unsigned char ch)
1765{
Victor Stinnera464fc12011-10-02 20:39:30 +02001766 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001768 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 if (!unicode)
1770 return NULL;
1771 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001772 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773 unicode_latin1[ch] = unicode;
1774 }
1775 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001776 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777}
1778
Victor Stinner985a82a2014-01-03 12:53:47 +01001779static PyObject*
1780unicode_char(Py_UCS4 ch)
1781{
1782 PyObject *unicode;
1783
1784 assert(ch <= MAX_UNICODE);
1785
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001786 if (ch < 256)
1787 return get_latin1_char(ch);
1788
Victor Stinner985a82a2014-01-03 12:53:47 +01001789 unicode = PyUnicode_New(1, ch);
1790 if (unicode == NULL)
1791 return NULL;
1792 switch (PyUnicode_KIND(unicode)) {
1793 case PyUnicode_1BYTE_KIND:
1794 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1795 break;
1796 case PyUnicode_2BYTE_KIND:
1797 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1798 break;
1799 default:
1800 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1801 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1802 }
1803 assert(_PyUnicode_CheckConsistency(unicode, 1));
1804 return unicode;
1805}
1806
Alexander Belopolsky40018472011-02-26 01:02:56 +00001807PyObject *
1808PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001810 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811 Py_UCS4 maxchar = 0;
1812 Py_ssize_t num_surrogates;
1813
1814 if (u == NULL)
1815 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001816
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001817 /* If the Unicode data is known at construction time, we can apply
1818 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001820 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001821 if (size == 0)
1822 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001824 /* Single character Unicode objects in the Latin-1 range are
1825 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001826 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 return get_latin1_char((unsigned char)*u);
1828
1829 /* If not empty and not single character, copy the Unicode data
1830 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001831 if (find_maxchar_surrogates(u, u + size,
1832 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001833 return NULL;
1834
Victor Stinner8faf8212011-12-08 22:14:11 +01001835 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001836 if (!unicode)
1837 return NULL;
1838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 switch (PyUnicode_KIND(unicode)) {
1840 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001841 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001842 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1843 break;
1844 case PyUnicode_2BYTE_KIND:
1845#if Py_UNICODE_SIZE == 2
1846 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1847#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001848 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001849 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1850#endif
1851 break;
1852 case PyUnicode_4BYTE_KIND:
1853#if SIZEOF_WCHAR_T == 2
1854 /* This is the only case which has to process surrogates, thus
1855 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001856 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857#else
1858 assert(num_surrogates == 0);
1859 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1860#endif
1861 break;
1862 default:
1863 assert(0 && "Impossible state");
1864 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001865
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001866 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001867}
1868
Alexander Belopolsky40018472011-02-26 01:02:56 +00001869PyObject *
1870PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001871{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001872 if (size < 0) {
1873 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001874 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001875 return NULL;
1876 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001877 if (u != NULL)
1878 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1879 else
1880 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001881}
1882
Alexander Belopolsky40018472011-02-26 01:02:56 +00001883PyObject *
1884PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001885{
1886 size_t size = strlen(u);
1887 if (size > PY_SSIZE_T_MAX) {
1888 PyErr_SetString(PyExc_OverflowError, "input too long");
1889 return NULL;
1890 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001891 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001892}
1893
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001894PyObject *
1895_PyUnicode_FromId(_Py_Identifier *id)
1896{
1897 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001898 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1899 strlen(id->string),
1900 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001901 if (!id->object)
1902 return NULL;
1903 PyUnicode_InternInPlace(&id->object);
1904 assert(!id->next);
1905 id->next = static_strings;
1906 static_strings = id;
1907 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001908 return id->object;
1909}
1910
1911void
1912_PyUnicode_ClearStaticStrings()
1913{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001914 _Py_Identifier *tmp, *s = static_strings;
1915 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001916 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001917 tmp = s->next;
1918 s->next = NULL;
1919 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001920 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001921 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001922}
1923
Benjamin Peterson0df54292012-03-26 14:50:32 -04001924/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001925
Victor Stinnerd3f08822012-05-29 12:57:52 +02001926PyObject*
1927_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001928{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001929 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001930 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001931 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001932#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001933 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001934#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001935 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001936 }
Victor Stinner785938e2011-12-11 20:09:03 +01001937 unicode = PyUnicode_New(size, 127);
1938 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001939 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001940 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1941 assert(_PyUnicode_CheckConsistency(unicode, 1));
1942 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001943}
1944
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001945static Py_UCS4
1946kind_maxchar_limit(unsigned int kind)
1947{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001948 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001949 case PyUnicode_1BYTE_KIND:
1950 return 0x80;
1951 case PyUnicode_2BYTE_KIND:
1952 return 0x100;
1953 case PyUnicode_4BYTE_KIND:
1954 return 0x10000;
1955 default:
1956 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001957 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001958 }
1959}
1960
Victor Stinnere6abb482012-05-02 01:15:40 +02001961Py_LOCAL_INLINE(Py_UCS4)
1962align_maxchar(Py_UCS4 maxchar)
1963{
1964 if (maxchar <= 127)
1965 return 127;
1966 else if (maxchar <= 255)
1967 return 255;
1968 else if (maxchar <= 65535)
1969 return 65535;
1970 else
1971 return MAX_UNICODE;
1972}
1973
Victor Stinner702c7342011-10-05 13:50:52 +02001974static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001975_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001976{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001978 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001979
Serhiy Storchaka678db842013-01-26 12:16:36 +02001980 if (size == 0)
1981 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001982 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001983 if (size == 1)
1984 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001985
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001986 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001987 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988 if (!res)
1989 return NULL;
1990 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001991 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001993}
1994
Victor Stinnere57b1c02011-09-28 22:20:48 +02001995static PyObject*
1996_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001997{
1998 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001999 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002000
Serhiy Storchaka678db842013-01-26 12:16:36 +02002001 if (size == 0)
2002 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002003 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002004 if (size == 1)
2005 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002006
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002007 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002008 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009 if (!res)
2010 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002011 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002013 else {
2014 _PyUnicode_CONVERT_BYTES(
2015 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2016 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002017 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002018 return res;
2019}
2020
Victor Stinnere57b1c02011-09-28 22:20:48 +02002021static PyObject*
2022_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002023{
2024 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002025 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002026
Serhiy Storchaka678db842013-01-26 12:16:36 +02002027 if (size == 0)
2028 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002029 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002030 if (size == 1)
2031 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002032
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002033 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002034 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 if (!res)
2036 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002037 if (max_char < 256)
2038 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2039 PyUnicode_1BYTE_DATA(res));
2040 else if (max_char < 0x10000)
2041 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2042 PyUnicode_2BYTE_DATA(res));
2043 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002044 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002045 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046 return res;
2047}
2048
2049PyObject*
2050PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2051{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002052 if (size < 0) {
2053 PyErr_SetString(PyExc_ValueError, "size must be positive");
2054 return NULL;
2055 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002056 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002057 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002058 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002060 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002061 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002062 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002063 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002064 PyErr_SetString(PyExc_SystemError, "invalid kind");
2065 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002066 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002067}
2068
Victor Stinnerece58de2012-04-23 23:36:38 +02002069Py_UCS4
2070_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2071{
2072 enum PyUnicode_Kind kind;
2073 void *startptr, *endptr;
2074
2075 assert(PyUnicode_IS_READY(unicode));
2076 assert(0 <= start);
2077 assert(end <= PyUnicode_GET_LENGTH(unicode));
2078 assert(start <= end);
2079
2080 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2081 return PyUnicode_MAX_CHAR_VALUE(unicode);
2082
2083 if (start == end)
2084 return 127;
2085
Victor Stinner94d558b2012-04-27 22:26:58 +02002086 if (PyUnicode_IS_ASCII(unicode))
2087 return 127;
2088
Victor Stinnerece58de2012-04-23 23:36:38 +02002089 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002090 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002091 endptr = (char *)startptr + end * kind;
2092 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002093 switch(kind) {
2094 case PyUnicode_1BYTE_KIND:
2095 return ucs1lib_find_max_char(startptr, endptr);
2096 case PyUnicode_2BYTE_KIND:
2097 return ucs2lib_find_max_char(startptr, endptr);
2098 case PyUnicode_4BYTE_KIND:
2099 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002100 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002101 assert(0);
2102 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002103 }
2104}
2105
Victor Stinner25a4b292011-10-06 12:31:55 +02002106/* Ensure that a string uses the most efficient storage, if it is not the
2107 case: create a new string with of the right kind. Write NULL into *p_unicode
2108 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002109static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002110unicode_adjust_maxchar(PyObject **p_unicode)
2111{
2112 PyObject *unicode, *copy;
2113 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002114 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002115 unsigned int kind;
2116
2117 assert(p_unicode != NULL);
2118 unicode = *p_unicode;
2119 assert(PyUnicode_IS_READY(unicode));
2120 if (PyUnicode_IS_ASCII(unicode))
2121 return;
2122
2123 len = PyUnicode_GET_LENGTH(unicode);
2124 kind = PyUnicode_KIND(unicode);
2125 if (kind == PyUnicode_1BYTE_KIND) {
2126 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002127 max_char = ucs1lib_find_max_char(u, u + len);
2128 if (max_char >= 128)
2129 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002130 }
2131 else if (kind == PyUnicode_2BYTE_KIND) {
2132 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002133 max_char = ucs2lib_find_max_char(u, u + len);
2134 if (max_char >= 256)
2135 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002136 }
2137 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002138 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002139 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002140 max_char = ucs4lib_find_max_char(u, u + len);
2141 if (max_char >= 0x10000)
2142 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002143 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002144 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002145 if (copy != NULL)
2146 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002147 Py_DECREF(unicode);
2148 *p_unicode = copy;
2149}
2150
Victor Stinner034f6cf2011-09-30 02:26:44 +02002151PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002152_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002153{
Victor Stinner87af4f22011-11-21 23:03:47 +01002154 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002155 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002156
Victor Stinner034f6cf2011-09-30 02:26:44 +02002157 if (!PyUnicode_Check(unicode)) {
2158 PyErr_BadInternalCall();
2159 return NULL;
2160 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002161 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002162 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002163
Victor Stinner87af4f22011-11-21 23:03:47 +01002164 length = PyUnicode_GET_LENGTH(unicode);
2165 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002166 if (!copy)
2167 return NULL;
2168 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2169
Victor Stinner87af4f22011-11-21 23:03:47 +01002170 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2171 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002172 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002173 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002174}
2175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002176
Victor Stinnerbc603d12011-10-02 01:00:40 +02002177/* Widen Unicode objects to larger buffers. Don't write terminating null
2178 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002179
2180void*
2181_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2182{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002183 Py_ssize_t len;
2184 void *result;
2185 unsigned int skind;
2186
Benjamin Petersonbac79492012-01-14 13:34:47 -05002187 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002188 return NULL;
2189
2190 len = PyUnicode_GET_LENGTH(s);
2191 skind = PyUnicode_KIND(s);
2192 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002193 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 return NULL;
2195 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002196 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002197 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002198 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002199 if (!result)
2200 return PyErr_NoMemory();
2201 assert(skind == PyUnicode_1BYTE_KIND);
2202 _PyUnicode_CONVERT_BYTES(
2203 Py_UCS1, Py_UCS2,
2204 PyUnicode_1BYTE_DATA(s),
2205 PyUnicode_1BYTE_DATA(s) + len,
2206 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002207 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002208 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002209 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002210 if (!result)
2211 return PyErr_NoMemory();
2212 if (skind == PyUnicode_2BYTE_KIND) {
2213 _PyUnicode_CONVERT_BYTES(
2214 Py_UCS2, Py_UCS4,
2215 PyUnicode_2BYTE_DATA(s),
2216 PyUnicode_2BYTE_DATA(s) + len,
2217 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002219 else {
2220 assert(skind == PyUnicode_1BYTE_KIND);
2221 _PyUnicode_CONVERT_BYTES(
2222 Py_UCS1, Py_UCS4,
2223 PyUnicode_1BYTE_DATA(s),
2224 PyUnicode_1BYTE_DATA(s) + len,
2225 result);
2226 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002228 default:
2229 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002230 }
Victor Stinner01698042011-10-04 00:04:26 +02002231 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002232 return NULL;
2233}
2234
2235static Py_UCS4*
2236as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2237 int copy_null)
2238{
2239 int kind;
2240 void *data;
2241 Py_ssize_t len, targetlen;
2242 if (PyUnicode_READY(string) == -1)
2243 return NULL;
2244 kind = PyUnicode_KIND(string);
2245 data = PyUnicode_DATA(string);
2246 len = PyUnicode_GET_LENGTH(string);
2247 targetlen = len;
2248 if (copy_null)
2249 targetlen++;
2250 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002251 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 if (!target) {
2253 PyErr_NoMemory();
2254 return NULL;
2255 }
2256 }
2257 else {
2258 if (targetsize < targetlen) {
2259 PyErr_Format(PyExc_SystemError,
2260 "string is longer than the buffer");
2261 if (copy_null && 0 < targetsize)
2262 target[0] = 0;
2263 return NULL;
2264 }
2265 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002266 if (kind == PyUnicode_1BYTE_KIND) {
2267 Py_UCS1 *start = (Py_UCS1 *) data;
2268 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002269 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002270 else if (kind == PyUnicode_2BYTE_KIND) {
2271 Py_UCS2 *start = (Py_UCS2 *) data;
2272 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2273 }
2274 else {
2275 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002277 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 if (copy_null)
2279 target[len] = 0;
2280 return target;
2281}
2282
2283Py_UCS4*
2284PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2285 int copy_null)
2286{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002287 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002288 PyErr_BadInternalCall();
2289 return NULL;
2290 }
2291 return as_ucs4(string, target, targetsize, copy_null);
2292}
2293
2294Py_UCS4*
2295PyUnicode_AsUCS4Copy(PyObject *string)
2296{
2297 return as_ucs4(string, NULL, 0, 1);
2298}
2299
2300#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002301
Alexander Belopolsky40018472011-02-26 01:02:56 +00002302PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002303PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002304{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002305 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002306 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002307 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002308 PyErr_BadInternalCall();
2309 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002310 }
2311
Martin v. Löwis790465f2008-04-05 20:41:37 +00002312 if (size == -1) {
2313 size = wcslen(w);
2314 }
2315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002316 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002317}
2318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002319#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002320
Walter Dörwald346737f2007-05-31 10:44:43 +00002321static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002322makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002323 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002324{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002325 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002326 if (longflag)
2327 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002328 else if (longlongflag) {
2329 /* longlongflag should only ever be nonzero on machines with
2330 HAVE_LONG_LONG defined */
2331#ifdef HAVE_LONG_LONG
2332 char *f = PY_FORMAT_LONG_LONG;
2333 while (*f)
2334 *fmt++ = *f++;
2335#else
2336 /* we shouldn't ever get here */
2337 assert(0);
2338 *fmt++ = 'l';
2339#endif
2340 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002341 else if (size_tflag) {
2342 char *f = PY_FORMAT_SIZE_T;
2343 while (*f)
2344 *fmt++ = *f++;
2345 }
2346 *fmt++ = c;
2347 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002348}
2349
/* maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign. 53/22 is an upper bound for log10(256).
   NOTE(review): the arrays sized with this macro are filled by sprintf(),
   so the leading 2 presumably covers the sign plus the terminating null
   character -- confirm. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002354
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002355static int
2356unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2357 Py_ssize_t width, Py_ssize_t precision)
2358{
2359 Py_ssize_t length, fill, arglen;
2360 Py_UCS4 maxchar;
2361
2362 if (PyUnicode_READY(str) == -1)
2363 return -1;
2364
2365 length = PyUnicode_GET_LENGTH(str);
2366 if ((precision == -1 || precision >= length)
2367 && width <= length)
2368 return _PyUnicodeWriter_WriteStr(writer, str);
2369
2370 if (precision != -1)
2371 length = Py_MIN(precision, length);
2372
2373 arglen = Py_MAX(length, width);
2374 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2375 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2376 else
2377 maxchar = writer->maxchar;
2378
2379 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2380 return -1;
2381
2382 if (width > length) {
2383 fill = width - length;
2384 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2385 return -1;
2386 writer->pos += fill;
2387 }
2388
2389 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2390 str, 0, length);
2391 writer->pos += length;
2392 return 0;
2393}
2394
2395static int
2396unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2397 Py_ssize_t width, Py_ssize_t precision)
2398{
2399 /* UTF-8 */
2400 Py_ssize_t length;
2401 PyObject *unicode;
2402 int res;
2403
2404 length = strlen(str);
2405 if (precision != -1)
2406 length = Py_MIN(length, precision);
2407 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2408 if (unicode == NULL)
2409 return -1;
2410
2411 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2412 Py_DECREF(unicode);
2413 return res;
2414}
2415
/* Process one format directive of PyUnicode_FromFormatV().

   `f` points to the '%' that starts the directive.  The optional '0' flag,
   width, precision and length modifier (l, ll, z) are parsed, the matching
   argument(s) are fetched from *vargs and the formatted text is appended
   to `writer`.

   Return a pointer to the first character after the directive, or NULL on
   error.  For an unknown conversion character the rest of the format
   string (starting at the '%') is copied verbatim and a pointer to its end
   is returned. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* remember the start of the directive for the "unknown code" fallback */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* reject the next digit if appending it would exceed
               PY_SSIZE_T_MAX */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* same overflow guard as for the width */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        /* the modifier is only consumed when a d, u or i conversion
           follows; otherwise f stays on the 'l' */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* the conversion character is the last character of the format
       string: switch off overallocation in the writer */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* single character given by its code point */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char fmt[10]; /* should be enough for "%0lld\0" */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* The number is first formatted without width/precision into
           `buffer`; padding is applied afterwards when writing. */
        if (*f == 'u') {
            makefmt(fmt, longflag, longlongflag, size_tflag, *f);

            if (longflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, size_t));
            else
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned int));
        }
        else if (*f == 'x') {
            /* %x never gets a length modifier: the argument is always
               read as an int */
            makefmt(fmt, 0, 0, 0, 'x');
            len = sprintf(buffer, fmt, va_arg(*vargs, int));
        }
        else {
            makefmt(fmt, longflag, longlongflag, size_tflag, *f);

            if (longflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, Py_ssize_t));
            else
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, int));
        }
        assert(len >= 0);

        /* from here on precision is at least the number of characters
           produced by sprintf */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* left padding up to the width: '0' with the zero flag, else ' ' */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* zeros up to the precision, written before the sprintf output */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* shift the text (including its null terminator) right by
               two characters to make room for the "0x" prefix */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* string object argument */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* two arguments are always consumed: a string object and a C
           string used as a fallback when the object is NULL */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* result of PyObject_Str() on the argument */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* result of PyObject_Repr() on the argument */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* result of PyObject_ASCII() on the argument */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* literal percent sign; no argument is consumed */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    /* step over the conversion character */
    f++;
    return f;
}
2725
Walter Dörwaldd2034312007-05-18 16:29:38 +00002726PyObject *
2727PyUnicode_FromFormatV(const char *format, va_list vargs)
2728{
Victor Stinnere215d962012-10-06 23:03:36 +02002729 va_list vargs2;
2730 const char *f;
2731 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002732
Victor Stinner8f674cc2013-04-17 23:02:17 +02002733 _PyUnicodeWriter_Init(&writer);
2734 writer.min_length = strlen(format) + 100;
2735 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002736
2737 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2738 Copy it to be able to pass a reference to a subfunction. */
2739 Py_VA_COPY(vargs2, vargs);
2740
2741 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002742 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002743 f = unicode_fromformat_arg(&writer, f, &vargs2);
2744 if (f == NULL)
2745 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002746 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002747 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002748 const char *p;
2749 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002750
Victor Stinnere215d962012-10-06 23:03:36 +02002751 p = f;
2752 do
2753 {
2754 if ((unsigned char)*p > 127) {
2755 PyErr_Format(PyExc_ValueError,
2756 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2757 "string, got a non-ASCII byte: 0x%02x",
2758 (unsigned char)*p);
2759 return NULL;
2760 }
2761 p++;
2762 }
2763 while (*p != '\0' && *p != '%');
2764 len = p - f;
2765
2766 if (*p == '\0')
2767 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002768
2769 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002770 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002771
2772 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002773 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002774 }
Victor Stinnere215d962012-10-06 23:03:36 +02002775 return _PyUnicodeWriter_Finish(&writer);
2776
2777 fail:
2778 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002779 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002780}
2781
Walter Dörwaldd2034312007-05-18 16:29:38 +00002782PyObject *
2783PyUnicode_FromFormat(const char *format, ...)
2784{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002785 PyObject* ret;
2786 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002787
2788#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002789 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002790#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002791 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002792#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002793 ret = PyUnicode_FromFormatV(format, vargs);
2794 va_end(vargs);
2795 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002796}
2797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002798#ifdef HAVE_WCHAR_H
2799
Victor Stinner5593d8a2010-10-02 11:11:27 +00002800/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2801 convert a Unicode object to a wide character string.
2802
Victor Stinnerd88d9832011-09-06 02:00:05 +02002803 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002804 character) required to convert the unicode object. Ignore size argument.
2805
Victor Stinnerd88d9832011-09-06 02:00:05 +02002806 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002807 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002808 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002809static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002810unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002811 wchar_t *w,
2812 Py_ssize_t size)
2813{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002814 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002815 const wchar_t *wstr;
2816
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002817 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002818 if (wstr == NULL)
2819 return -1;
2820
Victor Stinner5593d8a2010-10-02 11:11:27 +00002821 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002822 if (size > res)
2823 size = res + 1;
2824 else
2825 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002826 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002827 return res;
2828 }
2829 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002830 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002831}
2832
2833Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002834PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002835 wchar_t *w,
2836 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837{
2838 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002839 PyErr_BadInternalCall();
2840 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002841 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002842 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843}
2844
Victor Stinner137c34c2010-09-29 10:25:54 +00002845wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002846PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002847 Py_ssize_t *size)
2848{
2849 wchar_t* buffer;
2850 Py_ssize_t buflen;
2851
2852 if (unicode == NULL) {
2853 PyErr_BadInternalCall();
2854 return NULL;
2855 }
2856
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002857 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002858 if (buflen == -1)
2859 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002860 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00002861 if (buffer == NULL) {
2862 PyErr_NoMemory();
2863 return NULL;
2864 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002865 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002866 if (buflen == -1) {
2867 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002868 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002869 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002870 if (size != NULL)
2871 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002872 return buffer;
2873}
2874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002875#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002876
Alexander Belopolsky40018472011-02-26 01:02:56 +00002877PyObject *
2878PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002879{
Victor Stinner8faf8212011-12-08 22:14:11 +01002880 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002881 PyErr_SetString(PyExc_ValueError,
2882 "chr() arg not in range(0x110000)");
2883 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002884 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002885
Victor Stinner985a82a2014-01-03 12:53:47 +01002886 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002887}
2888
Alexander Belopolsky40018472011-02-26 01:02:56 +00002889PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002890PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002892 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002893 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002894 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002895 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002896 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002897 Py_INCREF(obj);
2898 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002899 }
2900 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002901 /* For a Unicode subtype that's not a Unicode object,
2902 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002903 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002904 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002905 PyErr_Format(PyExc_TypeError,
2906 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002907 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002908 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002909}
2910
Alexander Belopolsky40018472011-02-26 01:02:56 +00002911PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002912PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002913 const char *encoding,
2914 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002915{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002916 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002917 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002918
Guido van Rossumd57fd912000-03-10 22:53:23 +00002919 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002920 PyErr_BadInternalCall();
2921 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002922 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002923
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002924 /* Decoding bytes objects is the most common case and should be fast */
2925 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002926 if (PyBytes_GET_SIZE(obj) == 0)
2927 _Py_RETURN_UNICODE_EMPTY();
2928 v = PyUnicode_Decode(
2929 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2930 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002931 return v;
2932 }
2933
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002934 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002935 PyErr_SetString(PyExc_TypeError,
2936 "decoding str is not supported");
2937 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002938 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002939
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002940 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2941 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2942 PyErr_Format(PyExc_TypeError,
Serhiy Storchakab757c832014-12-05 22:25:22 +02002943 "coercing to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002944 Py_TYPE(obj)->tp_name);
2945 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002946 }
Tim Petersced69f82003-09-16 20:30:58 +00002947
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002948 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002949 PyBuffer_Release(&buffer);
2950 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002951 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002952
Serhiy Storchaka05997252013-01-26 12:14:02 +02002953 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002954 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002955 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002956}
2957
/* Normalize an encoding name: convert it to lower case and replace '_' with
   '-' in order to catch e.g. UTF_8.  A NULL encoding is normalized to
   "utf-8".

   The result, including its terminating null byte, is written to lower,
   which has room for lower_len bytes.

   Return 1 on success, 0 if the normalized name does not fit in lower
   (encoding is longer than lower_len-1, or lower_len is 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* Without room for even the terminator there is nothing to write.
       This must be checked first: lower_len - 1 below would wrap around for
       0 and the copy loop would never reach its bound. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last writable position, reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2997
Alexander Belopolsky40018472011-02-26 01:02:56 +00002998PyObject *
2999PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003000 Py_ssize_t size,
3001 const char *encoding,
3002 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003003{
3004 PyObject *buffer = NULL, *unicode;
3005 Py_buffer info;
3006 char lower[11]; /* Enough for any encoding shortcut */
3007
Fred Drakee4315f52000-05-09 19:53:39 +00003008 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003009 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003010 if ((strcmp(lower, "utf-8") == 0) ||
3011 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003012 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003013 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003014 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003015 (strcmp(lower, "iso-8859-1") == 0) ||
3016 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00003017 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003018#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003019 else if (strcmp(lower, "mbcs") == 0)
3020 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003021#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003022 else if (strcmp(lower, "ascii") == 0)
3023 return PyUnicode_DecodeASCII(s, size, errors);
3024 else if (strcmp(lower, "utf-16") == 0)
3025 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3026 else if (strcmp(lower, "utf-32") == 0)
3027 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3028 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029
3030 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003031 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003032 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003033 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003034 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035 if (buffer == NULL)
3036 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003037 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 if (unicode == NULL)
3039 goto onError;
3040 if (!PyUnicode_Check(unicode)) {
3041 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003042 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3043 "use codecs.decode() to decode to arbitrary types",
3044 encoding,
3045 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 Py_DECREF(unicode);
3047 goto onError;
3048 }
3049 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003050 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003051
Benjamin Peterson29060642009-01-31 22:14:21 +00003052 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053 Py_XDECREF(buffer);
3054 return NULL;
3055}
3056
Alexander Belopolsky40018472011-02-26 01:02:56 +00003057PyObject *
3058PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003059 const char *encoding,
3060 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003061{
3062 PyObject *v;
3063
3064 if (!PyUnicode_Check(unicode)) {
3065 PyErr_BadArgument();
3066 goto onError;
3067 }
3068
3069 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003070 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003071
3072 /* Decode via the codec registry */
3073 v = PyCodec_Decode(unicode, encoding, errors);
3074 if (v == NULL)
3075 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003076 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003077
Benjamin Peterson29060642009-01-31 22:14:21 +00003078 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003079 return NULL;
3080}
3081
Alexander Belopolsky40018472011-02-26 01:02:56 +00003082PyObject *
3083PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003084 const char *encoding,
3085 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003086{
3087 PyObject *v;
3088
3089 if (!PyUnicode_Check(unicode)) {
3090 PyErr_BadArgument();
3091 goto onError;
3092 }
3093
3094 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003095 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003096
3097 /* Decode via the codec registry */
3098 v = PyCodec_Decode(unicode, encoding, errors);
3099 if (v == NULL)
3100 goto onError;
3101 if (!PyUnicode_Check(v)) {
3102 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003103 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3104 "use codecs.decode() to decode to arbitrary types",
3105 encoding,
3106 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003107 Py_DECREF(v);
3108 goto onError;
3109 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003110 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003111
Benjamin Peterson29060642009-01-31 22:14:21 +00003112 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003113 return NULL;
3114}
3115
Alexander Belopolsky40018472011-02-26 01:02:56 +00003116PyObject *
3117PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003118 Py_ssize_t size,
3119 const char *encoding,
3120 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121{
3122 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003123
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124 unicode = PyUnicode_FromUnicode(s, size);
3125 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003126 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3128 Py_DECREF(unicode);
3129 return v;
3130}
3131
Alexander Belopolsky40018472011-02-26 01:02:56 +00003132PyObject *
3133PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003134 const char *encoding,
3135 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003136{
3137 PyObject *v;
3138
3139 if (!PyUnicode_Check(unicode)) {
3140 PyErr_BadArgument();
3141 goto onError;
3142 }
3143
3144 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003145 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003146
3147 /* Encode via the codec registry */
3148 v = PyCodec_Encode(unicode, encoding, errors);
3149 if (v == NULL)
3150 goto onError;
3151 return v;
3152
Benjamin Peterson29060642009-01-31 22:14:21 +00003153 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003154 return NULL;
3155}
3156
/* Locate the first character of wstr that wcstombs() cannot convert by
   converting the string one character at a time (one surrogate pair at a
   time when wchar_t is 16 bits wide).

   Return the index of that character in wstr, or 0 if every character
   converts on its own. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or surrogate pair) plus the terminating null. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif
    while (*cursor != L'\0') {
        /* Remember where this character starts in case it is the culprit. */
        const wchar_t *mark = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return mark - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3203
Victor Stinner1b579672011-12-17 05:47:23 +01003204static int
3205locale_error_handler(const char *errors, int *surrogateescape)
3206{
3207 if (errors == NULL) {
3208 *surrogateescape = 0;
3209 return 0;
3210 }
3211
3212 if (strcmp(errors, "strict") == 0) {
3213 *surrogateescape = 0;
3214 return 0;
3215 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003216 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003217 *surrogateescape = 1;
3218 return 0;
3219 }
3220 PyErr_Format(PyExc_ValueError,
3221 "only 'strict' and 'surrogateescape' error handlers "
3222 "are supported, not '%s'",
3223 errors);
3224 return -1;
3225}
3226
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003227PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003228PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003229{
3230 Py_ssize_t wlen, wlen2;
3231 wchar_t *wstr;
3232 PyObject *bytes = NULL;
3233 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003234 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003235 PyObject *exc;
3236 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003237 int surrogateescape;
3238
3239 if (locale_error_handler(errors, &surrogateescape) < 0)
3240 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003241
3242 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3243 if (wstr == NULL)
3244 return NULL;
3245
3246 wlen2 = wcslen(wstr);
3247 if (wlen2 != wlen) {
3248 PyMem_Free(wstr);
3249 PyErr_SetString(PyExc_TypeError, "embedded null character");
3250 return NULL;
3251 }
3252
3253 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003254 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003255 char *str;
3256
3257 str = _Py_wchar2char(wstr, &error_pos);
3258 if (str == NULL) {
3259 if (error_pos == (size_t)-1) {
3260 PyErr_NoMemory();
3261 PyMem_Free(wstr);
3262 return NULL;
3263 }
3264 else {
3265 goto encode_error;
3266 }
3267 }
3268 PyMem_Free(wstr);
3269
3270 bytes = PyBytes_FromString(str);
3271 PyMem_Free(str);
3272 }
3273 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003274 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003275 size_t len, len2;
3276
3277 len = wcstombs(NULL, wstr, 0);
3278 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003279 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003280 goto encode_error;
3281 }
3282
3283 bytes = PyBytes_FromStringAndSize(NULL, len);
3284 if (bytes == NULL) {
3285 PyMem_Free(wstr);
3286 return NULL;
3287 }
3288
3289 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3290 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003291 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003292 goto encode_error;
3293 }
3294 PyMem_Free(wstr);
3295 }
3296 return bytes;
3297
3298encode_error:
3299 errmsg = strerror(errno);
3300 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003301
3302 if (error_pos == (size_t)-1)
3303 error_pos = wcstombs_errorpos(wstr);
3304
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003305 PyMem_Free(wstr);
3306 Py_XDECREF(bytes);
3307
Victor Stinner2f197072011-12-17 07:08:30 +01003308 if (errmsg != NULL) {
3309 size_t errlen;
3310 wstr = _Py_char2wchar(errmsg, &errlen);
3311 if (wstr != NULL) {
3312 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003313 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003314 } else
3315 errmsg = NULL;
3316 }
3317 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003318 reason = PyUnicode_FromString(
3319 "wcstombs() encountered an unencodable "
3320 "wide character");
3321 if (reason == NULL)
3322 return NULL;
3323
3324 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3325 "locale", unicode,
3326 (Py_ssize_t)error_pos,
3327 (Py_ssize_t)(error_pos+1),
3328 reason);
3329 Py_DECREF(reason);
3330 if (exc != NULL) {
3331 PyCodec_StrictErrors(exc);
3332 Py_XDECREF(exc);
3333 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003334 return NULL;
3335}
3336
Victor Stinnerad158722010-10-27 00:25:46 +00003337PyObject *
3338PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003339{
Victor Stinner99b95382011-07-04 14:23:54 +02003340#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003341 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003342#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003343 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003344#else
Victor Stinner793b5312011-04-27 00:24:21 +02003345 PyInterpreterState *interp = PyThreadState_GET()->interp;
3346 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3347 cannot use it to encode and decode filenames before it is loaded. Load
3348 the Python codec requires to encode at least its own filename. Use the C
3349 version of the locale codec until the codec registry is initialized and
3350 the Python codec is loaded.
3351
3352 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3353 cannot only rely on it: check also interp->fscodec_initialized for
3354 subinterpreters. */
3355 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003356 return PyUnicode_AsEncodedString(unicode,
3357 Py_FileSystemDefaultEncoding,
3358 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003359 }
3360 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003361 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003362 }
Victor Stinnerad158722010-10-27 00:25:46 +00003363#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003364}
3365
Alexander Belopolsky40018472011-02-26 01:02:56 +00003366PyObject *
3367PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003368 const char *encoding,
3369 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003370{
3371 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003372 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003373
Guido van Rossumd57fd912000-03-10 22:53:23 +00003374 if (!PyUnicode_Check(unicode)) {
3375 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003376 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377 }
Fred Drakee4315f52000-05-09 19:53:39 +00003378
Fred Drakee4315f52000-05-09 19:53:39 +00003379 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003380 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003381 if ((strcmp(lower, "utf-8") == 0) ||
3382 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003383 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003384 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003385 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003386 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003387 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003388 }
Victor Stinner37296e82010-06-10 13:36:23 +00003389 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003390 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003391 (strcmp(lower, "iso-8859-1") == 0) ||
3392 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003393 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003394#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003395 else if (strcmp(lower, "mbcs") == 0)
3396 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003397#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003398 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003399 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003400 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003401
3402 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003403 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003404 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003405 return NULL;
3406
3407 /* The normal path */
3408 if (PyBytes_Check(v))
3409 return v;
3410
3411 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003412 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003413 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003414 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003415
3416 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003417 "encoder %s returned bytearray instead of bytes; "
3418 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003419 encoding);
3420 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003421 Py_DECREF(v);
3422 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003423 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003424
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003425 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3426 Py_DECREF(v);
3427 return b;
3428 }
3429
3430 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003431 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3432 "use codecs.encode() to encode to arbitrary types",
3433 encoding,
3434 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003435 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003436 return NULL;
3437}
3438
Alexander Belopolsky40018472011-02-26 01:02:56 +00003439PyObject *
3440PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003441 const char *encoding,
3442 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003443{
3444 PyObject *v;
3445
3446 if (!PyUnicode_Check(unicode)) {
3447 PyErr_BadArgument();
3448 goto onError;
3449 }
3450
3451 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003452 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003453
3454 /* Encode via the codec registry */
3455 v = PyCodec_Encode(unicode, encoding, errors);
3456 if (v == NULL)
3457 goto onError;
3458 if (!PyUnicode_Check(v)) {
3459 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003460 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3461 "use codecs.encode() to encode to arbitrary types",
3462 encoding,
3463 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003464 Py_DECREF(v);
3465 goto onError;
3466 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003468
Benjamin Peterson29060642009-01-31 22:14:21 +00003469 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 return NULL;
3471}
3472
/* Locate the first byte sequence in the len bytes at str that cannot be
   converted to a wide character, by decoding one character at a time with
   mbrtowc().

   Return the offset of that sequence from the start of str.  Return 0 if no
   such sequence is found, or if mbrtowc() is not available
   (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    /* Start from the initial conversion state. */
    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        else {
            str += converted;
            len -= converted;
        }
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3503
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003504PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003505PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003506 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003507{
3508 wchar_t smallbuf[256];
3509 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3510 wchar_t *wstr;
3511 size_t wlen, wlen2;
3512 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003513 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003514 size_t error_pos;
3515 char *errmsg;
3516 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003517
3518 if (locale_error_handler(errors, &surrogateescape) < 0)
3519 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003520
3521 if (str[len] != '\0' || len != strlen(str)) {
3522 PyErr_SetString(PyExc_TypeError, "embedded null character");
3523 return NULL;
3524 }
3525
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003526 if (surrogateescape) {
3527 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003528 wstr = _Py_char2wchar(str, &wlen);
3529 if (wstr == NULL) {
3530 if (wlen == (size_t)-1)
3531 PyErr_NoMemory();
3532 else
3533 PyErr_SetFromErrno(PyExc_OSError);
3534 return NULL;
3535 }
3536
3537 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003538 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003539 }
3540 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003541 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003542#ifndef HAVE_BROKEN_MBSTOWCS
3543 wlen = mbstowcs(NULL, str, 0);
3544#else
3545 wlen = len;
3546#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003547 if (wlen == (size_t)-1)
3548 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003549 if (wlen+1 <= smallbuf_len) {
3550 wstr = smallbuf;
3551 }
3552 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003553 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003554 if (!wstr)
3555 return PyErr_NoMemory();
3556 }
3557
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003558 wlen2 = mbstowcs(wstr, str, wlen+1);
3559 if (wlen2 == (size_t)-1) {
3560 if (wstr != smallbuf)
3561 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003562 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003563 }
3564#ifdef HAVE_BROKEN_MBSTOWCS
3565 assert(wlen2 == wlen);
3566#endif
3567 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3568 if (wstr != smallbuf)
3569 PyMem_Free(wstr);
3570 }
3571 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003572
3573decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003574 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003575 errmsg = strerror(errno);
3576 assert(errmsg != NULL);
3577
3578 error_pos = mbstowcs_errorpos(str, len);
3579 if (errmsg != NULL) {
3580 size_t errlen;
3581 wstr = _Py_char2wchar(errmsg, &errlen);
3582 if (wstr != NULL) {
3583 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003584 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003585 }
Victor Stinner2f197072011-12-17 07:08:30 +01003586 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003587 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003588 reason = PyUnicode_FromString(
3589 "mbstowcs() encountered an invalid multibyte sequence");
3590 if (reason == NULL)
3591 return NULL;
3592
3593 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3594 "locale", str, len,
3595 (Py_ssize_t)error_pos,
3596 (Py_ssize_t)(error_pos+1),
3597 reason);
3598 Py_DECREF(reason);
3599 if (exc != NULL) {
3600 PyCodec_StrictErrors(exc);
3601 Py_XDECREF(exc);
3602 }
3603 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003604}
3605
3606PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003607PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003608{
3609 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003610 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003611}
3612
3613
3614PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003615PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003616 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003617 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3618}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003619
Christian Heimes5894ba72007-11-04 11:43:14 +00003620PyObject*
3621PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3622{
Victor Stinner99b95382011-07-04 14:23:54 +02003623#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003624 return PyUnicode_DecodeMBCS(s, size, NULL);
3625#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003626 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003627#else
Victor Stinner793b5312011-04-27 00:24:21 +02003628 PyInterpreterState *interp = PyThreadState_GET()->interp;
3629 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3630 cannot use it to encode and decode filenames before it is loaded. Load
3631 the Python codec requires to encode at least its own filename. Use the C
3632 version of the locale codec until the codec registry is initialized and
3633 the Python codec is loaded.
3634
3635 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3636 cannot only rely on it: check also interp->fscodec_initialized for
3637 subinterpreters. */
3638 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003639 return PyUnicode_Decode(s, size,
3640 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003641 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003642 }
3643 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003644 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003645 }
Victor Stinnerad158722010-10-27 00:25:46 +00003646#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003647}
3648
Martin v. Löwis011e8422009-05-05 04:43:17 +00003649
3650int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003651_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003652{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003653 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003654
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003655 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003656 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003657 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3658 PyUnicode_GET_LENGTH(str), '\0', 1);
3659 if (pos == -1)
3660 return 0;
3661 else
3662 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003663}
3664
Antoine Pitrou13348842012-01-29 18:36:34 +01003665int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003666PyUnicode_FSConverter(PyObject* arg, void* addr)
3667{
3668 PyObject *output = NULL;
3669 Py_ssize_t size;
3670 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003671 if (arg == NULL) {
3672 Py_DECREF(*(PyObject**)addr);
3673 return 1;
3674 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003675 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003676 output = arg;
3677 Py_INCREF(output);
3678 }
3679 else {
3680 arg = PyUnicode_FromObject(arg);
3681 if (!arg)
3682 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003683 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003684 Py_DECREF(arg);
3685 if (!output)
3686 return 0;
3687 if (!PyBytes_Check(output)) {
3688 Py_DECREF(output);
3689 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3690 return 0;
3691 }
3692 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003693 size = PyBytes_GET_SIZE(output);
3694 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003695 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003696 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003697 Py_DECREF(output);
3698 return 0;
3699 }
3700 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003701 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003702}
3703
3704
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003705int
3706PyUnicode_FSDecoder(PyObject* arg, void* addr)
3707{
3708 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003709 if (arg == NULL) {
3710 Py_DECREF(*(PyObject**)addr);
3711 return 1;
3712 }
3713 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003714 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003715 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003716 output = arg;
3717 Py_INCREF(output);
3718 }
3719 else {
3720 arg = PyBytes_FromObject(arg);
3721 if (!arg)
3722 return 0;
3723 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3724 PyBytes_GET_SIZE(arg));
3725 Py_DECREF(arg);
3726 if (!output)
3727 return 0;
3728 if (!PyUnicode_Check(output)) {
3729 Py_DECREF(output);
3730 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3731 return 0;
3732 }
3733 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003734 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003735 Py_DECREF(output);
3736 return 0;
3737 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003738 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003739 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003740 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3741 Py_DECREF(output);
3742 return 0;
3743 }
3744 *(PyObject**)addr = output;
3745 return Py_CLEANUP_SUPPORTED;
3746}
3747
3748
Martin v. Löwis5b222132007-06-10 09:51:05 +00003749char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003750PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003751{
Christian Heimesf3863112007-11-22 07:46:41 +00003752 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003753
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003754 if (!PyUnicode_Check(unicode)) {
3755 PyErr_BadArgument();
3756 return NULL;
3757 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003758 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003759 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003760
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003761 if (PyUnicode_UTF8(unicode) == NULL) {
3762 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003763 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3764 if (bytes == NULL)
3765 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003766 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3767 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003768 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003769 Py_DECREF(bytes);
3770 return NULL;
3771 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003772 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3773 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3774 PyBytes_AS_STRING(bytes),
3775 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003776 Py_DECREF(bytes);
3777 }
3778
3779 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003780 *psize = PyUnicode_UTF8_LENGTH(unicode);
3781 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003782}
3783
3784char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003785PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003786{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003787 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3788}
3789
/* Return the wchar_t (wstr) buffer of `unicode`, building and caching it
   on the object when _PyUnicode_WSTR(unicode) is still NULL.

   When `size` is not NULL it receives PyUnicode_WSTR_LENGTH(unicode).
   Returns NULL with an exception set if `unicode` is not a str
   (PyErr_BadArgument) or if the buffer cannot be allocated
   (PyErr_NoMemory).  A string kind whose width equals sizeof(wchar_t)
   reaching the conversion code is reported through Py_FatalError(). */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
    /* Only the source pointers needed for the conversions that are compiled
       in for this wchar_t width are declared. */
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* First pass: count the code points above 0xFFFF; each of them
               needs two wchar_t units (a surrogate pair). */
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* length + extra surrogate units + 1 for the terminating 0 */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            /* Second pass: copy, splitting code points above 0xFFFF. */
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= MAX_UNICODE);
                    /* encode surrogate pair in this case */
                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
                    *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
                }
                else
                    *w = *four_bytes;

                /* debug-only sanity check of the size computed above */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            /* null-terminate the wstr */
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* Reject lengths for which the byte count computed below
               (sizeof(wchar_t) * (length + 1)) would exceed PY_SSIZE_T_MAX. */
            if ((size_t)_PyUnicode_LENGTH(unicode) >
                    PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* the wstr length is only stored for objects that are not
               compact ASCII */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                /* widen each byte to one wchar_t */
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                /* widen each 2-byte unit to one wchar_t */
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* release the buffer allocated above before aborting */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
3906
Alexander Belopolsky40018472011-02-26 01:02:56 +00003907Py_UNICODE *
3908PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003909{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003910 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003911}
3912
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003913
Alexander Belopolsky40018472011-02-26 01:02:56 +00003914Py_ssize_t
3915PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003916{
3917 if (!PyUnicode_Check(unicode)) {
3918 PyErr_BadArgument();
3919 goto onError;
3920 }
3921 return PyUnicode_GET_SIZE(unicode);
3922
Benjamin Peterson29060642009-01-31 22:14:21 +00003923 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003924 return -1;
3925}
3926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003927Py_ssize_t
3928PyUnicode_GetLength(PyObject *unicode)
3929{
Victor Stinner07621332012-06-16 04:53:46 +02003930 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003931 PyErr_BadArgument();
3932 return -1;
3933 }
Victor Stinner07621332012-06-16 04:53:46 +02003934 if (PyUnicode_READY(unicode) == -1)
3935 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003936 return PyUnicode_GET_LENGTH(unicode);
3937}
3938
3939Py_UCS4
3940PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3941{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003942 void *data;
3943 int kind;
3944
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003945 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3946 PyErr_BadArgument();
3947 return (Py_UCS4)-1;
3948 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003949 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003950 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003951 return (Py_UCS4)-1;
3952 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003953 data = PyUnicode_DATA(unicode);
3954 kind = PyUnicode_KIND(unicode);
3955 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003956}
3957
3958int
3959PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3960{
3961 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003962 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003963 return -1;
3964 }
Victor Stinner488fa492011-12-12 00:01:39 +01003965 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003966 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003967 PyErr_SetString(PyExc_IndexError, "string index out of range");
3968 return -1;
3969 }
Victor Stinner488fa492011-12-12 00:01:39 +01003970 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003971 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003972 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3973 PyErr_SetString(PyExc_ValueError, "character out of range");
3974 return -1;
3975 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003976 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3977 index, ch);
3978 return 0;
3979}
3980
/* Return the name of the default encoding, the constant string "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3986
Victor Stinner554f3f02010-06-16 23:33:54 +00003987/* create or adjust a UnicodeDecodeError */
3988static void
3989make_decode_exception(PyObject **exceptionObject,
3990 const char *encoding,
3991 const char *input, Py_ssize_t length,
3992 Py_ssize_t startpos, Py_ssize_t endpos,
3993 const char *reason)
3994{
3995 if (*exceptionObject == NULL) {
3996 *exceptionObject = PyUnicodeDecodeError_Create(
3997 encoding, input, length, startpos, endpos, reason);
3998 }
3999 else {
4000 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4001 goto onError;
4002 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4003 goto onError;
4004 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4005 goto onError;
4006 }
4007 return;
4008
4009onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004010 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004011}
4012
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004013#ifdef HAVE_MBCS
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004014/* error handling callback helper:
4015 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004016 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004017 and adjust various state variables.
4018 return 0 on success, -1 on error
4019*/
4020
Alexander Belopolsky40018472011-02-26 01:02:56 +00004021static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004022unicode_decode_call_errorhandler_wchar(
4023 const char *errors, PyObject **errorHandler,
4024 const char *encoding, const char *reason,
4025 const char **input, const char **inend, Py_ssize_t *startinpos,
4026 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4027 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004028{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004029 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004030
4031 PyObject *restuple = NULL;
4032 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004033 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004034 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004035 Py_ssize_t requiredsize;
4036 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004037 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004038 wchar_t *repwstr;
4039 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004040
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004041 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
4042 outsize = _PyUnicode_WSTR_LENGTH(*output);
Victor Stinner596a6c42011-11-09 00:02:18 +01004043
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004044 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004045 *errorHandler = PyCodec_LookupError(errors);
4046 if (*errorHandler == NULL)
4047 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004048 }
4049
Victor Stinner554f3f02010-06-16 23:33:54 +00004050 make_decode_exception(exceptionObject,
4051 encoding,
4052 *input, *inend - *input,
4053 *startinpos, *endinpos,
4054 reason);
4055 if (*exceptionObject == NULL)
4056 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004057
4058 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4059 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004060 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004061 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004062 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004063 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064 }
4065 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004066 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004067
4068 /* Copy back the bytes variables, which might have been modified by the
4069 callback */
4070 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4071 if (!inputobj)
4072 goto onError;
4073 if (!PyBytes_Check(inputobj)) {
4074 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4075 }
4076 *input = PyBytes_AS_STRING(inputobj);
4077 insize = PyBytes_GET_SIZE(inputobj);
4078 *inend = *input + insize;
4079 /* we can DECREF safely, as the exception has another reference,
4080 so the object won't go away. */
4081 Py_DECREF(inputobj);
4082
4083 if (newpos<0)
4084 newpos = insize+newpos;
4085 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004086 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004087 goto onError;
4088 }
4089
4090 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4091 if (repwstr == NULL)
4092 goto onError;
4093 /* need more space? (at least enough for what we
4094 have+the replacement+the rest of the string (starting
4095 at the new input position), so we won't have to check space
4096 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004097 requiredsize = *outpos;
4098 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4099 goto overflow;
4100 requiredsize += repwlen;
4101 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4102 goto overflow;
4103 requiredsize += insize - newpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004104 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004105 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004106 requiredsize = 2*outsize;
4107 if (unicode_resize(output, requiredsize) < 0)
4108 goto onError;
4109 }
4110 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4111 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004112 *endinpos = newpos;
4113 *inptr = *input + newpos;
4114
4115 /* we made it! */
4116 Py_XDECREF(restuple);
4117 return 0;
4118
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004119 overflow:
4120 PyErr_SetString(PyExc_OverflowError,
4121 "decoded result is too long for a Python string");
4122
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004123 onError:
4124 Py_XDECREF(restuple);
4125 return -1;
4126}
4127#endif /* HAVE_MBCS */
4128
4129static int
4130unicode_decode_call_errorhandler_writer(
4131 const char *errors, PyObject **errorHandler,
4132 const char *encoding, const char *reason,
4133 const char **input, const char **inend, Py_ssize_t *startinpos,
4134 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4135 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4136{
4137 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4138
4139 PyObject *restuple = NULL;
4140 PyObject *repunicode = NULL;
4141 Py_ssize_t insize;
4142 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004143 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004144 PyObject *inputobj = NULL;
4145
4146 if (*errorHandler == NULL) {
4147 *errorHandler = PyCodec_LookupError(errors);
4148 if (*errorHandler == NULL)
4149 goto onError;
4150 }
4151
4152 make_decode_exception(exceptionObject,
4153 encoding,
4154 *input, *inend - *input,
4155 *startinpos, *endinpos,
4156 reason);
4157 if (*exceptionObject == NULL)
4158 goto onError;
4159
4160 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4161 if (restuple == NULL)
4162 goto onError;
4163 if (!PyTuple_Check(restuple)) {
4164 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4165 goto onError;
4166 }
4167 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004168 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004169
4170 /* Copy back the bytes variables, which might have been modified by the
4171 callback */
4172 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4173 if (!inputobj)
4174 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004175 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004176 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004177 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004178 *input = PyBytes_AS_STRING(inputobj);
4179 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004180 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004181 /* we can DECREF safely, as the exception has another reference,
4182 so the object won't go away. */
4183 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004184
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004185 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004186 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004187 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004188 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004189 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004190 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191
Victor Stinner8f674cc2013-04-17 23:02:17 +02004192 if (PyUnicode_READY(repunicode) < 0)
4193 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004194 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004195 if (replen > 1) {
4196 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004197 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004198 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4199 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4200 goto onError;
4201 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004202 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004203 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004204
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004205 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004206 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004207
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004208 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004209 Py_XDECREF(restuple);
4210 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004211
Benjamin Peterson29060642009-01-31 22:14:21 +00004212 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004214 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004215}
4216
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004217/* --- UTF-7 Codec -------------------------------------------------------- */
4218
Antoine Pitrou244651a2009-05-04 18:56:13 +00004219/* See RFC2152 for details. We encode conservatively and decode liberally. */
4220
/* Base-64 helper macros for the UTF-7 codec.  Each one may evaluate its
   argument more than once. */

/* IS_BASE64: non-zero when c belongs to the base-64 alphabet
   (A-Z, a-z, 0-9, '+' and '/'). */
#define IS_BASE64(c)                     \
    (((c) >= 'A' && (c) <= 'Z') ||       \
     ((c) >= 'a' && (c) <= 'z') ||       \
     ((c) >= '0' && (c) <= '9') ||       \
     (c) == '+' ||                       \
     (c) == '/')

/* FROM_BASE64: the 6-bit value (0..63) of a character for which
   IS_BASE64() holds. */
#define FROM_BASE64(c)                                   \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :            \
     ((c) >= 'a' && (c) <= 'z') ? 26 + ((c) - 'a') :     \
     ((c) >= '0' && (c) <= '9') ? 52 + ((c) - '0') :     \
     (c) == '+' ? 62 :                                   \
     63)

/* TO_BASE64: the base-64 character for the low 6 bits of n. */
#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: a byte found in a UTF-7 string that decodes to itself.
 * Decoding is permissive: every ASCII byte qualifies except '+', which
 * opens a base-64 section. */
#define DECODE_DIRECT(c) \
    ((c) <= 127 && (c) != '+')
4251
/* Classification of the 128 ASCII codes as used by the UTF-7 encoder
 * (see RFC 2152).  Each entry of the table is one of:
 *   0 : "Set D"      - alphanumerics and ' ( ) , - . / : ?
 *   1 : "Set O"      - ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
 *   2 : "whitespace" - ht nl cr sp
 *   3 : special      - must always be base64 encoded: + \ ~ and the
 *                      non-printing codes 0-8, 11-12, 14-31 and 127
 *
 * The table is a pure lookup table and is never written to, so it is
 * declared const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4285
/* ENCODE_DIRECT: true when character c may be written to the UTF-7
 * output as itself.  Set D characters always qualify; Set O characters
 * qualify only when directO is true and whitespace only when directWS
 * is true.  RFC2152 makes it clear that these two choices vary between
 * applications, so they are left to the caller.  NUL and anything
 * outside ASCII never qualify.
 *
 * Every argument is parenthesized in the expansion so that compound
 * flag expressions (e.g. `a || b`) are evaluated as a unit.  c is
 * evaluated several times, so it must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)                 \
    ((c) < 128 && (c) > 0 &&                                \
     ((utf7_category[(c)] == 0) ||                          \
      ((directWS) && (utf7_category[(c)] == 2)) ||          \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004297
Alexander Belopolsky40018472011-02-26 01:02:56 +00004298PyObject *
4299PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004300 Py_ssize_t size,
4301 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004302{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004303 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4304}
4305
Antoine Pitrou244651a2009-05-04 18:56:13 +00004306/* The decoder. The only state we preserve is our read position,
4307 * i.e. how many characters we have consumed. So if we end in the
4308 * middle of a shift sequence we have to back off the read position
4309 * and the output to the beginning of the sequence, otherwise we lose
4310 * all the shift state (seen bits, number of bits seen, high
4311 * surrogate). */
4312
Alexander Belopolsky40018472011-02-26 01:02:56 +00004313PyObject *
4314PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004315 Py_ssize_t size,
4316 const char *errors,
4317 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004318{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004320 Py_ssize_t startinpos;
4321 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004322 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004323 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004324 const char *errmsg = "";
4325 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004326 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004327 unsigned int base64bits = 0;
4328 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004329 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004330 PyObject *errorHandler = NULL;
4331 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004332
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004333 if (size == 0) {
4334 if (consumed)
4335 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004336 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004337 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004338
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004339 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004340 _PyUnicodeWriter_Init(&writer);
4341 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004342
4343 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004344 e = s + size;
4345
4346 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004347 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004348 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004349 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004350
Antoine Pitrou244651a2009-05-04 18:56:13 +00004351 if (inShift) { /* in a base-64 section */
4352 if (IS_BASE64(ch)) { /* consume a base-64 character */
4353 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4354 base64bits += 6;
4355 s++;
4356 if (base64bits >= 16) {
4357 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004358 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359 base64bits -= 16;
4360 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004361 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004362 if (surrogate) {
4363 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004364 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4365 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004366 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004367 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004368 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004369 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004370 }
4371 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004372 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004373 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004374 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004375 }
4376 }
Victor Stinner551ac952011-11-29 22:58:13 +01004377 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004378 /* first surrogate */
4379 surrogate = outCh;
4380 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004381 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004382 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004383 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004384 }
4385 }
4386 }
4387 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004388 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004389 if (base64bits > 0) { /* left-over bits */
4390 if (base64bits >= 6) {
4391 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004392 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004393 errmsg = "partial character in shift sequence";
4394 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004395 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004396 else {
4397 /* Some bits remain; they should be zero */
4398 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004399 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004400 errmsg = "non-zero padding bits in shift sequence";
4401 goto utf7Error;
4402 }
4403 }
4404 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004405 if (surrogate && DECODE_DIRECT(ch)) {
4406 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4407 goto onError;
4408 }
4409 surrogate = 0;
4410 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004411 /* '-' is absorbed; other terminating
4412 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004413 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004414 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004415 }
4416 }
4417 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004418 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004419 s++; /* consume '+' */
4420 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004421 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004422 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004423 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004424 }
4425 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004426 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004427 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004428 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004429 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004430 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004431 }
4432 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004433 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004434 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004435 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004436 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004437 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004438 else {
4439 startinpos = s-starts;
4440 s++;
4441 errmsg = "unexpected special character";
4442 goto utf7Error;
4443 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004444 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004445utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004447 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004448 errors, &errorHandler,
4449 "utf7", errmsg,
4450 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004451 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004452 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004453 }
4454
Antoine Pitrou244651a2009-05-04 18:56:13 +00004455 /* end of string */
4456
4457 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4458 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004459 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004460 if (surrogate ||
4461 (base64bits >= 6) ||
4462 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004463 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004464 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004465 errors, &errorHandler,
4466 "utf7", "unterminated shift sequence",
4467 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004468 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004469 goto onError;
4470 if (s < e)
4471 goto restart;
4472 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004473 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004474
4475 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004476 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004477 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004478 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004479 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004480 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004481 writer.kind, writer.data, shiftOutStart);
4482 Py_XDECREF(errorHandler);
4483 Py_XDECREF(exc);
4484 _PyUnicodeWriter_Dealloc(&writer);
4485 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004486 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004487 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004488 }
4489 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004490 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004491 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004492 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004493
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004494 Py_XDECREF(errorHandler);
4495 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004496 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004497
Benjamin Peterson29060642009-01-31 22:14:21 +00004498 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004499 Py_XDECREF(errorHandler);
4500 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004501 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004502 return NULL;
4503}
4504
4505
Alexander Belopolsky40018472011-02-26 01:02:56 +00004506PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004507_PyUnicode_EncodeUTF7(PyObject *str,
4508 int base64SetO,
4509 int base64WhiteSpace,
4510 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004511{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004512 int kind;
4513 void *data;
4514 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004515 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004516 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004517 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004518 unsigned int base64bits = 0;
4519 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004520 char * out;
4521 char * start;
4522
Benjamin Petersonbac79492012-01-14 13:34:47 -05004523 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004524 return NULL;
4525 kind = PyUnicode_KIND(str);
4526 data = PyUnicode_DATA(str);
4527 len = PyUnicode_GET_LENGTH(str);
4528
4529 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004530 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004531
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004532 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004533 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004534 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004535 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004536 if (v == NULL)
4537 return NULL;
4538
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004539 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004540 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004541 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004542
Antoine Pitrou244651a2009-05-04 18:56:13 +00004543 if (inShift) {
4544 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4545 /* shifting out */
4546 if (base64bits) { /* output remaining bits */
4547 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4548 base64buffer = 0;
4549 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004550 }
4551 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004552 /* Characters not in the BASE64 set implicitly unshift the sequence
4553 so no '-' is required, except if the character is itself a '-' */
4554 if (IS_BASE64(ch) || ch == '-') {
4555 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004556 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004557 *out++ = (char) ch;
4558 }
4559 else {
4560 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004561 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004562 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004563 else { /* not in a shift sequence */
4564 if (ch == '+') {
4565 *out++ = '+';
4566 *out++ = '-';
4567 }
4568 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4569 *out++ = (char) ch;
4570 }
4571 else {
4572 *out++ = '+';
4573 inShift = 1;
4574 goto encode_char;
4575 }
4576 }
4577 continue;
4578encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004579 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004580 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004581
Antoine Pitrou244651a2009-05-04 18:56:13 +00004582 /* code first surrogate */
4583 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004584 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 while (base64bits >= 6) {
4586 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4587 base64bits -= 6;
4588 }
4589 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004590 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004591 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004592 base64bits += 16;
4593 base64buffer = (base64buffer << 16) | ch;
4594 while (base64bits >= 6) {
4595 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4596 base64bits -= 6;
4597 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004598 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004599 if (base64bits)
4600 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4601 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004602 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004603 if (_PyBytes_Resize(&v, out - start) < 0)
4604 return NULL;
4605 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004606}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004607PyObject *
4608PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4609 Py_ssize_t size,
4610 int base64SetO,
4611 int base64WhiteSpace,
4612 const char *errors)
4613{
4614 PyObject *result;
4615 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4616 if (tmp == NULL)
4617 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004618 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004619 base64WhiteSpace, errors);
4620 Py_DECREF(tmp);
4621 return result;
4622}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004623
Antoine Pitrou244651a2009-05-04 18:56:13 +00004624#undef IS_BASE64
4625#undef FROM_BASE64
4626#undef TO_BASE64
4627#undef DECODE_DIRECT
4628#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004629
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630/* --- UTF-8 Codec -------------------------------------------------------- */
4631
Alexander Belopolsky40018472011-02-26 01:02:56 +00004632PyObject *
4633PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004634 Py_ssize_t size,
4635 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004636{
Walter Dörwald69652032004-09-07 20:24:22 +00004637 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4638}
4639
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004640#include "stringlib/asciilib.h"
4641#include "stringlib/codecs.h"
4642#include "stringlib/undef.h"
4643
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004644#include "stringlib/ucs1lib.h"
4645#include "stringlib/codecs.h"
4646#include "stringlib/undef.h"
4647
4648#include "stringlib/ucs2lib.h"
4649#include "stringlib/codecs.h"
4650#include "stringlib/undef.h"
4651
4652#include "stringlib/ucs4lib.h"
4653#include "stringlib/codecs.h"
4654#include "stringlib/undef.h"
4655
Antoine Pitrouab868312009-01-10 15:40:25 +00004656/* Mask to quickly check whether a C 'long' contains a
4657 non-ASCII, UTF8-encoded char. */
4658#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004659# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004660#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004661# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004662#else
4663# error C 'long' size should be either 4 or 8!
4664#endif
4665
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004666static Py_ssize_t
4667ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004668{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004669 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004670 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004671
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004672 /*
4673 * Issue #17237: m68k is a bit different from most architectures in
4674 * that objects do not use "natural alignment" - for example, int and
4675 * long are only aligned at 2-byte boundaries. Therefore the assert()
4676 * won't work; also, tests have shown that skipping the "optimised
4677 * version" will even speed up m68k.
4678 */
4679#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004680#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004681 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4682 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004683 /* Fast path, see in STRINGLIB(utf8_decode) for
4684 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004685 /* Help allocation */
4686 const char *_p = p;
4687 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004688 while (_p < aligned_end) {
4689 unsigned long value = *(const unsigned long *) _p;
4690 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004691 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004692 *((unsigned long *)q) = value;
4693 _p += SIZEOF_LONG;
4694 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004695 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004696 p = _p;
4697 while (p < end) {
4698 if ((unsigned char)*p & 0x80)
4699 break;
4700 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004701 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004702 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004703 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004704#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004705#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004706 while (p < end) {
4707 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4708 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004709 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004710 /* Help allocation */
4711 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004712 while (_p < aligned_end) {
4713 unsigned long value = *(unsigned long *) _p;
4714 if (value & ASCII_CHAR_MASK)
4715 break;
4716 _p += SIZEOF_LONG;
4717 }
4718 p = _p;
4719 if (_p == end)
4720 break;
4721 }
4722 if ((unsigned char)*p & 0x80)
4723 break;
4724 ++p;
4725 }
4726 memcpy(dest, start, p - start);
4727 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728}
Antoine Pitrouab868312009-01-10 15:40:25 +00004729
Victor Stinner785938e2011-12-11 20:09:03 +01004730PyObject *
4731PyUnicode_DecodeUTF8Stateful(const char *s,
4732 Py_ssize_t size,
4733 const char *errors,
4734 Py_ssize_t *consumed)
4735{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004736 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004737 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004738 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004739
4740 Py_ssize_t startinpos;
4741 Py_ssize_t endinpos;
4742 const char *errmsg = "";
4743 PyObject *errorHandler = NULL;
4744 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004745
4746 if (size == 0) {
4747 if (consumed)
4748 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004749 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004750 }
4751
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004752 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4753 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004754 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004755 *consumed = 1;
4756 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004757 }
4758
Victor Stinner8f674cc2013-04-17 23:02:17 +02004759 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004760 writer.min_length = size;
4761 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004762 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004763
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004764 writer.pos = ascii_decode(s, end, writer.data);
4765 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004766 while (s < end) {
4767 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004768 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004769 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004770 if (PyUnicode_IS_ASCII(writer.buffer))
4771 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004772 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004773 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004774 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004775 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004776 } else {
4777 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004778 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004779 }
4780
4781 switch (ch) {
4782 case 0:
4783 if (s == end || consumed)
4784 goto End;
4785 errmsg = "unexpected end of data";
4786 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004787 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004788 break;
4789 case 1:
4790 errmsg = "invalid start byte";
4791 startinpos = s - starts;
4792 endinpos = startinpos + 1;
4793 break;
4794 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004795 case 3:
4796 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004797 errmsg = "invalid continuation byte";
4798 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004799 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004800 break;
4801 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004802 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004803 goto onError;
4804 continue;
4805 }
4806
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004807 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004808 errors, &errorHandler,
4809 "utf-8", errmsg,
4810 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004811 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004812 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004813 }
4814
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004815End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004816 if (consumed)
4817 *consumed = s - starts;
4818
4819 Py_XDECREF(errorHandler);
4820 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004821 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004822
4823onError:
4824 Py_XDECREF(errorHandler);
4825 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004826 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004827 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004828}
4829
#ifdef __APPLE__

/* Simplified UTF-8 decoder using the surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   s points to size bytes of input.  Every byte that cannot be decoded
   is mapped to the code unit 0xDC00 + byte.

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_RawFree() to free the memory).  Return NULL on
   memory allocation error, or if size is negative or too large for the
   output buffer size to be computed. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count, so size + 1 wide characters are enough.
       The bound is checked in signed arithmetic and without computing
       size + 1, which could itself overflow. */
    if (size < 0 ||
        size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
            /* the decoder handed back a character it could not store */
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* Only a code point above the BMP can be split into a
               surrogate pair, so that is what must arrive here (not a
               value that is itself a surrogate). */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* 0 with the input used up means we are done */
            if (!ch && s == e)
                break;
            /* surrogateescape: smuggle the offending byte through */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004885
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004886/* Primary internal function which creates utf8 encoded bytes objects.
4887
4888 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004889 and allocate exactly as much space needed at the end. Else allocate the
4890 maximum possible needed (4 result bytes per Unicode character), and return
4891 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004892*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004893PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004894_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895{
Victor Stinner6099a032011-12-18 14:22:26 +01004896 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004897 void *data;
4898 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004900 if (!PyUnicode_Check(unicode)) {
4901 PyErr_BadArgument();
4902 return NULL;
4903 }
4904
4905 if (PyUnicode_READY(unicode) == -1)
4906 return NULL;
4907
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004908 if (PyUnicode_UTF8(unicode))
4909 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4910 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004911
4912 kind = PyUnicode_KIND(unicode);
4913 data = PyUnicode_DATA(unicode);
4914 size = PyUnicode_GET_LENGTH(unicode);
4915
Benjamin Petersonead6b532011-12-20 17:23:42 -06004916 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004917 default:
4918 assert(0);
4919 case PyUnicode_1BYTE_KIND:
4920 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4921 assert(!PyUnicode_IS_ASCII(unicode));
4922 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4923 case PyUnicode_2BYTE_KIND:
4924 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4925 case PyUnicode_4BYTE_KIND:
4926 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004927 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928}
4929
Alexander Belopolsky40018472011-02-26 01:02:56 +00004930PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004931PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4932 Py_ssize_t size,
4933 const char *errors)
4934{
4935 PyObject *v, *unicode;
4936
4937 unicode = PyUnicode_FromUnicode(s, size);
4938 if (unicode == NULL)
4939 return NULL;
4940 v = _PyUnicode_AsUTF8String(unicode, errors);
4941 Py_DECREF(unicode);
4942 return v;
4943}
4944
4945PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004946PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004947{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004948 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004949}
4950
Walter Dörwald41980ca2007-08-16 21:55:45 +00004951/* --- UTF-32 Codec ------------------------------------------------------- */
4952
4953PyObject *
4954PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004955 Py_ssize_t size,
4956 const char *errors,
4957 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004958{
4959 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4960}
4961
4962PyObject *
4963PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004964 Py_ssize_t size,
4965 const char *errors,
4966 int *byteorder,
4967 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004968{
4969 const char *starts = s;
4970 Py_ssize_t startinpos;
4971 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004972 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004973 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004974 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004975 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004976 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004977 PyObject *errorHandler = NULL;
4978 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004979
Walter Dörwald41980ca2007-08-16 21:55:45 +00004980 q = (unsigned char *)s;
4981 e = q + size;
4982
4983 if (byteorder)
4984 bo = *byteorder;
4985
4986 /* Check for BOM marks (U+FEFF) in the input and adjust current
4987 byte order setting accordingly. In native mode, the leading BOM
4988 mark is skipped, in all other modes, it is copied to the output
4989 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004990 if (bo == 0 && size >= 4) {
4991 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4992 if (bom == 0x0000FEFF) {
4993 bo = -1;
4994 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004995 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004996 else if (bom == 0xFFFE0000) {
4997 bo = 1;
4998 q += 4;
4999 }
5000 if (byteorder)
5001 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005002 }
5003
Victor Stinnere64322e2012-10-30 23:12:47 +01005004 if (q == e) {
5005 if (consumed)
5006 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005007 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005008 }
5009
Victor Stinnere64322e2012-10-30 23:12:47 +01005010#ifdef WORDS_BIGENDIAN
5011 le = bo < 0;
5012#else
5013 le = bo <= 0;
5014#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005015 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005016
Victor Stinner8f674cc2013-04-17 23:02:17 +02005017 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005018 writer.min_length = (e - q + 3) / 4;
5019 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005020 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005021
Victor Stinnere64322e2012-10-30 23:12:47 +01005022 while (1) {
5023 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005024 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005025
Victor Stinnere64322e2012-10-30 23:12:47 +01005026 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005027 enum PyUnicode_Kind kind = writer.kind;
5028 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005029 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005030 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005031 if (le) {
5032 do {
5033 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5034 if (ch > maxch)
5035 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005036 if (kind != PyUnicode_1BYTE_KIND &&
5037 Py_UNICODE_IS_SURROGATE(ch))
5038 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005039 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005040 q += 4;
5041 } while (q <= last);
5042 }
5043 else {
5044 do {
5045 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5046 if (ch > maxch)
5047 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005048 if (kind != PyUnicode_1BYTE_KIND &&
5049 Py_UNICODE_IS_SURROGATE(ch))
5050 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005051 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005052 q += 4;
5053 } while (q <= last);
5054 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005055 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005056 }
5057
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005058 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005059 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005060 startinpos = ((const char *)q) - starts;
5061 endinpos = startinpos + 4;
5062 }
5063 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005064 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005066 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005067 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005068 startinpos = ((const char *)q) - starts;
5069 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005070 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005071 else {
5072 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005073 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005074 goto onError;
5075 q += 4;
5076 continue;
5077 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005078 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005079 startinpos = ((const char *)q) - starts;
5080 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005081 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005082
5083 /* The remaining input chars are ignored if the callback
5084 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005085 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005086 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005087 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005088 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005089 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005090 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005091 }
5092
Walter Dörwald41980ca2007-08-16 21:55:45 +00005093 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005094 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005095
Walter Dörwald41980ca2007-08-16 21:55:45 +00005096 Py_XDECREF(errorHandler);
5097 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005098 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005099
Benjamin Peterson29060642009-01-31 22:14:21 +00005100 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005101 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005102 Py_XDECREF(errorHandler);
5103 Py_XDECREF(exc);
5104 return NULL;
5105}
5106
5107PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005108_PyUnicode_EncodeUTF32(PyObject *str,
5109 const char *errors,
5110 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005111{
Serhiy Storchaka30793282014-01-04 22:44:01 +02005112 int kind;
5113 void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005114 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005115 PyObject *v;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005116 unsigned char *p;
5117 Py_ssize_t nsize, i;
5118 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005119#if PY_LITTLE_ENDIAN
Serhiy Storchaka30793282014-01-04 22:44:01 +02005120 int iorder[] = {0, 1, 2, 3};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005121#else
Serhiy Storchaka30793282014-01-04 22:44:01 +02005122 int iorder[] = {3, 2, 1, 0};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005123#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005124 const char *encoding;
5125 PyObject *errorHandler = NULL;
5126 PyObject *exc = NULL;
5127 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005128
Serhiy Storchaka30793282014-01-04 22:44:01 +02005129#define STORECHAR(CH) \
5130 do { \
5131 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5132 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5133 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5134 p[iorder[0]] = (CH) & 0xff; \
5135 p += 4; \
5136 } while(0)
5137
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005138 if (!PyUnicode_Check(str)) {
5139 PyErr_BadArgument();
5140 return NULL;
5141 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005142 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005143 return NULL;
5144 kind = PyUnicode_KIND(str);
5145 data = PyUnicode_DATA(str);
5146 len = PyUnicode_GET_LENGTH(str);
5147
Serhiy Storchaka583a9392014-01-04 19:25:37 +02005148 nsize = len + (byteorder == 0);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005149 if (nsize > PY_SSIZE_T_MAX / 4)
5150 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005151 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005152 if (v == NULL)
5153 return NULL;
5154
Serhiy Storchaka30793282014-01-04 22:44:01 +02005155 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005156 if (byteorder == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005157 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005158 if (len == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005159 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005160
Serhiy Storchaka30793282014-01-04 22:44:01 +02005161 if (byteorder == -1) {
5162 /* force LE */
5163 iorder[0] = 0;
5164 iorder[1] = 1;
5165 iorder[2] = 2;
5166 iorder[3] = 3;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005167 encoding = "utf-32-le";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005168 }
5169 else if (byteorder == 1) {
5170 /* force BE */
5171 iorder[0] = 3;
5172 iorder[1] = 2;
5173 iorder[2] = 1;
5174 iorder[3] = 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005175 encoding = "utf-32-be";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005176 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005177 else
5178 encoding = "utf-32";
5179
5180 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005181 for (i = 0; i < len; i++)
5182 STORECHAR(PyUnicode_READ(kind, data, i));
5183 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005184 }
5185
Serhiy Storchaka30793282014-01-04 22:44:01 +02005186 for (i = 0; i < len;) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005187 Py_ssize_t repsize, moreunits;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005188 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5189 i++;
5190 assert(ch <= MAX_UNICODE);
5191 if (!Py_UNICODE_IS_SURROGATE(ch)) {
5192 STORECHAR(ch);
5193 continue;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005194 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005195
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005196 rep = unicode_encode_call_errorhandler(
5197 errors, &errorHandler,
5198 encoding, "surrogates not allowed",
Serhiy Storchaka30793282014-01-04 22:44:01 +02005199 str, &exc, i-1, i, &i);
5200
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005201 if (!rep)
5202 goto error;
5203
5204 if (PyBytes_Check(rep)) {
5205 repsize = PyBytes_GET_SIZE(rep);
5206 if (repsize & 3) {
5207 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005208 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005209 "surrogates not allowed");
5210 goto error;
5211 }
5212 moreunits = repsize / 4;
5213 }
5214 else {
5215 assert(PyUnicode_Check(rep));
5216 if (PyUnicode_READY(rep) < 0)
5217 goto error;
5218 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5219 if (!PyUnicode_IS_ASCII(rep)) {
5220 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005221 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005222 "surrogates not allowed");
5223 goto error;
5224 }
5225 }
5226
5227 /* four bytes are reserved for each surrogate */
5228 if (moreunits > 1) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005229 Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005230 Py_ssize_t morebytes = 4 * (moreunits - 1);
5231 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5232 /* integer overflow */
5233 PyErr_NoMemory();
5234 goto error;
5235 }
5236 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5237 goto error;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005238 p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005239 }
5240
5241 if (PyBytes_Check(rep)) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005242 Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
5243 p += repsize;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005244 } else /* rep is unicode */ {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005245 const Py_UCS1 *repdata;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005246 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005247 repdata = PyUnicode_1BYTE_DATA(rep);
5248 while (repsize--) {
5249 Py_UCS4 ch = *repdata++;
5250 STORECHAR(ch);
5251 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005252 }
5253
5254 Py_CLEAR(rep);
5255 }
5256
5257 /* Cut back to size actually needed. This is necessary for, for example,
5258 encoding of a string containing isolated surrogates and the 'ignore'
5259 handler is used. */
Serhiy Storchaka30793282014-01-04 22:44:01 +02005260 nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005261 if (nsize != PyBytes_GET_SIZE(v))
5262 _PyBytes_Resize(&v, nsize);
5263 Py_XDECREF(errorHandler);
5264 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005265 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005266 error:
5267 Py_XDECREF(rep);
5268 Py_XDECREF(errorHandler);
5269 Py_XDECREF(exc);
5270 Py_XDECREF(v);
5271 return NULL;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005272#undef STORECHAR
Walter Dörwald41980ca2007-08-16 21:55:45 +00005273}
5274
Alexander Belopolsky40018472011-02-26 01:02:56 +00005275PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005276PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5277 Py_ssize_t size,
5278 const char *errors,
5279 int byteorder)
5280{
5281 PyObject *result;
5282 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5283 if (tmp == NULL)
5284 return NULL;
5285 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5286 Py_DECREF(tmp);
5287 return result;
5288}
5289
5290PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005291PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005292{
Victor Stinnerb960b342011-11-20 19:12:52 +01005293 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005294}
5295
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296/* --- UTF-16 Codec ------------------------------------------------------- */
5297
Tim Peters772747b2001-08-09 22:21:55 +00005298PyObject *
5299PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005300 Py_ssize_t size,
5301 const char *errors,
5302 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303{
Walter Dörwald69652032004-09-07 20:24:22 +00005304 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5305}
5306
5307PyObject *
5308PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005309 Py_ssize_t size,
5310 const char *errors,
5311 int *byteorder,
5312 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005313{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005314 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005315 Py_ssize_t startinpos;
5316 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005317 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005318 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005319 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005320 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005321 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005322 PyObject *errorHandler = NULL;
5323 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005324 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325
Tim Peters772747b2001-08-09 22:21:55 +00005326 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005327 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328
5329 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005330 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005332 /* Check for BOM marks (U+FEFF) in the input and adjust current
5333 byte order setting accordingly. In native mode, the leading BOM
5334 mark is skipped, in all other modes, it is copied to the output
5335 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005336 if (bo == 0 && size >= 2) {
5337 const Py_UCS4 bom = (q[1] << 8) | q[0];
5338 if (bom == 0xFEFF) {
5339 q += 2;
5340 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005341 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005342 else if (bom == 0xFFFE) {
5343 q += 2;
5344 bo = 1;
5345 }
5346 if (byteorder)
5347 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349
Antoine Pitrou63065d72012-05-15 23:48:04 +02005350 if (q == e) {
5351 if (consumed)
5352 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005353 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005354 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005355
Christian Heimes743e0cd2012-10-17 23:52:17 +02005356#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005357 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005358 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005359#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005360 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005361 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005362#endif
Tim Peters772747b2001-08-09 22:21:55 +00005363
Antoine Pitrou63065d72012-05-15 23:48:04 +02005364 /* Note: size will always be longer than the resulting Unicode
5365 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005366 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005367 writer.min_length = (e - q + 1) / 2;
5368 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005369 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005370
Antoine Pitrou63065d72012-05-15 23:48:04 +02005371 while (1) {
5372 Py_UCS4 ch = 0;
5373 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005374 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005375 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005376 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005377 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005378 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005379 native_ordering);
5380 else
5381 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005382 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005383 native_ordering);
5384 } else if (kind == PyUnicode_2BYTE_KIND) {
5385 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005386 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005387 native_ordering);
5388 } else {
5389 assert(kind == PyUnicode_4BYTE_KIND);
5390 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005391 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005392 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005393 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005394 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005395
Antoine Pitrou63065d72012-05-15 23:48:04 +02005396 switch (ch)
5397 {
5398 case 0:
5399 /* remaining byte at the end? (size should be even) */
5400 if (q == e || consumed)
5401 goto End;
5402 errmsg = "truncated data";
5403 startinpos = ((const char *)q) - starts;
5404 endinpos = ((const char *)e) - starts;
5405 break;
5406 /* The remaining input chars are ignored if the callback
5407 chooses to skip the input */
5408 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005409 q -= 2;
5410 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005411 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005412 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005413 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005414 endinpos = ((const char *)e) - starts;
5415 break;
5416 case 2:
5417 errmsg = "illegal encoding";
5418 startinpos = ((const char *)q) - 2 - starts;
5419 endinpos = startinpos + 2;
5420 break;
5421 case 3:
5422 errmsg = "illegal UTF-16 surrogate";
5423 startinpos = ((const char *)q) - 4 - starts;
5424 endinpos = startinpos + 2;
5425 break;
5426 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005427 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005428 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005429 continue;
5430 }
5431
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005432 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005433 errors,
5434 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005435 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005436 &starts,
5437 (const char **)&e,
5438 &startinpos,
5439 &endinpos,
5440 &exc,
5441 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005442 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005443 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 }
5445
Antoine Pitrou63065d72012-05-15 23:48:04 +02005446End:
Walter Dörwald69652032004-09-07 20:24:22 +00005447 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005448 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005449
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005450 Py_XDECREF(errorHandler);
5451 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005452 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453
Benjamin Peterson29060642009-01-31 22:14:21 +00005454 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005455 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005456 Py_XDECREF(errorHandler);
5457 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 return NULL;
5459}
5460
Tim Peters772747b2001-08-09 22:21:55 +00005461PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005462_PyUnicode_EncodeUTF16(PyObject *str,
5463 const char *errors,
5464 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005466 enum PyUnicode_Kind kind;
5467 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005468 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005469 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005470 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005471 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005472#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005473 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005474#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005475 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005476#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005477 const char *encoding;
5478 Py_ssize_t nsize, pos;
5479 PyObject *errorHandler = NULL;
5480 PyObject *exc = NULL;
5481 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005482
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005483 if (!PyUnicode_Check(str)) {
5484 PyErr_BadArgument();
5485 return NULL;
5486 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005487 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005488 return NULL;
5489 kind = PyUnicode_KIND(str);
5490 data = PyUnicode_DATA(str);
5491 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005492
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005493 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005494 if (kind == PyUnicode_4BYTE_KIND) {
5495 const Py_UCS4 *in = (const Py_UCS4 *)data;
5496 const Py_UCS4 *end = in + len;
5497 while (in < end)
5498 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005499 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005500 }
5501 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005502 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005503 nsize = len + pairs + (byteorder == 0);
5504 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005505 if (v == NULL)
5506 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005508 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005509 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005510 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005512 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005513 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005514 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005515
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005516 if (kind == PyUnicode_1BYTE_KIND) {
5517 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5518 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005519 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005520
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005521 if (byteorder < 0)
5522 encoding = "utf-16-le";
5523 else if (byteorder > 0)
5524 encoding = "utf-16-be";
5525 else
5526 encoding = "utf-16";
5527
5528 pos = 0;
5529 while (pos < len) {
5530 Py_ssize_t repsize, moreunits;
5531
5532 if (kind == PyUnicode_2BYTE_KIND) {
5533 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5534 &out, native_ordering);
5535 }
5536 else {
5537 assert(kind == PyUnicode_4BYTE_KIND);
5538 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5539 &out, native_ordering);
5540 }
5541 if (pos == len)
5542 break;
5543
5544 rep = unicode_encode_call_errorhandler(
5545 errors, &errorHandler,
5546 encoding, "surrogates not allowed",
5547 str, &exc, pos, pos + 1, &pos);
5548 if (!rep)
5549 goto error;
5550
5551 if (PyBytes_Check(rep)) {
5552 repsize = PyBytes_GET_SIZE(rep);
5553 if (repsize & 1) {
5554 raise_encode_exception(&exc, encoding,
5555 str, pos - 1, pos,
5556 "surrogates not allowed");
5557 goto error;
5558 }
5559 moreunits = repsize / 2;
5560 }
5561 else {
5562 assert(PyUnicode_Check(rep));
5563 if (PyUnicode_READY(rep) < 0)
5564 goto error;
5565 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5566 if (!PyUnicode_IS_ASCII(rep)) {
5567 raise_encode_exception(&exc, encoding,
5568 str, pos - 1, pos,
5569 "surrogates not allowed");
5570 goto error;
5571 }
5572 }
5573
5574 /* two bytes are reserved for each surrogate */
5575 if (moreunits > 1) {
5576 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5577 Py_ssize_t morebytes = 2 * (moreunits - 1);
5578 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5579 /* integer overflow */
5580 PyErr_NoMemory();
5581 goto error;
5582 }
5583 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5584 goto error;
5585 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5586 }
5587
5588 if (PyBytes_Check(rep)) {
5589 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5590 out += moreunits;
5591 } else /* rep is unicode */ {
5592 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5593 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5594 &out, native_ordering);
5595 }
5596
5597 Py_CLEAR(rep);
5598 }
5599
5600 /* Cut back to size actually needed. This is necessary for, for example,
5601 encoding of a string containing isolated surrogates and the 'ignore' handler
5602 is used. */
5603 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5604 if (nsize != PyBytes_GET_SIZE(v))
5605 _PyBytes_Resize(&v, nsize);
5606 Py_XDECREF(errorHandler);
5607 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005608 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005609 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005610 error:
5611 Py_XDECREF(rep);
5612 Py_XDECREF(errorHandler);
5613 Py_XDECREF(exc);
5614 Py_XDECREF(v);
5615 return NULL;
5616#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617}
5618
Alexander Belopolsky40018472011-02-26 01:02:56 +00005619PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005620PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5621 Py_ssize_t size,
5622 const char *errors,
5623 int byteorder)
5624{
5625 PyObject *result;
5626 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5627 if (tmp == NULL)
5628 return NULL;
5629 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5630 Py_DECREF(tmp);
5631 return result;
5632}
5633
5634PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005635PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005637 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638}
5639
5640/* --- Unicode Escape Codec ----------------------------------------------- */
5641
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005642/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5643 if all the escapes in the string make it still a valid ASCII string.
5644 Returns -1 if any escapes were found which cause the string to
5645 pop out of ASCII range. Otherwise returns the length of the
5646 required buffer to hold the string.
5647 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005648static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005649length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5650{
5651 const unsigned char *p = (const unsigned char *)s;
5652 const unsigned char *end = p + size;
5653 Py_ssize_t length = 0;
5654
5655 if (size < 0)
5656 return -1;
5657
5658 for (; p < end; ++p) {
5659 if (*p > 127) {
5660 /* Non-ASCII */
5661 return -1;
5662 }
5663 else if (*p != '\\') {
5664 /* Normal character */
5665 ++length;
5666 }
5667 else {
5668 /* Backslash-escape, check next char */
5669 ++p;
5670 /* Escape sequence reaches till end of string or
5671 non-ASCII follow-up. */
5672 if (p >= end || *p > 127)
5673 return -1;
5674 switch (*p) {
5675 case '\n':
5676 /* backslash + \n result in zero characters */
5677 break;
5678 case '\\': case '\'': case '\"':
5679 case 'b': case 'f': case 't':
5680 case 'n': case 'r': case 'v': case 'a':
5681 ++length;
5682 break;
5683 case '0': case '1': case '2': case '3':
5684 case '4': case '5': case '6': case '7':
5685 case 'x': case 'u': case 'U': case 'N':
5686 /* these do not guarantee ASCII characters */
5687 return -1;
5688 default:
5689 /* count the backslash + the other character */
5690 length += 2;
5691 }
5692 }
5693 }
5694 return length;
5695}
5696
Fredrik Lundh06d12682001-01-24 07:59:11 +00005697static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005698
Alexander Belopolsky40018472011-02-26 01:02:56 +00005699PyObject *
5700PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005701 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005702 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005704 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005705 Py_ssize_t startinpos;
5706 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005707 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005709 char* message;
5710 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005711 PyObject *errorHandler = NULL;
5712 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005713 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005714
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005715 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005716 if (len == 0)
5717 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005718
5719 /* After length_of_escaped_ascii_string() there are two alternatives,
5720 either the string is pure ASCII with named escapes like \n, etc.
5721 and we determined it's exact size (common case)
5722 or it contains \x, \u, ... escape sequences. then we create a
5723 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005724 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005725 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005726 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005727 }
5728 else {
5729 /* Escaped strings will always be longer than the resulting
5730 Unicode string, so we start with size here and then reduce the
5731 length after conversion to the true value.
5732 (but if the error callback returns a long replacement string
5733 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005734 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005735 }
5736
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005738 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005740
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 while (s < end) {
5742 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005743 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005744 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745
5746 /* Non-escape characters are interpreted as Unicode ordinals */
5747 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005748 x = (unsigned char)*s;
5749 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005750 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005751 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752 continue;
5753 }
5754
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005755 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 /* \ - Escapes */
5757 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005758 c = *s++;
5759 if (s > end)
5760 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005761
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005762 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763
Benjamin Peterson29060642009-01-31 22:14:21 +00005764 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005765#define WRITECHAR(ch) \
5766 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005767 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005768 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005769 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005770
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005772 case '\\': WRITECHAR('\\'); break;
5773 case '\'': WRITECHAR('\''); break;
5774 case '\"': WRITECHAR('\"'); break;
5775 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005776 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005777 case 'f': WRITECHAR('\014'); break;
5778 case 't': WRITECHAR('\t'); break;
5779 case 'n': WRITECHAR('\n'); break;
5780 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005781 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005782 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005783 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005784 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 case '0': case '1': case '2': case '3':
5788 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005789 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005790 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005791 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005792 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005793 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005795 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796 break;
5797
Benjamin Peterson29060642009-01-31 22:14:21 +00005798 /* hex escapes */
5799 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005801 digits = 2;
5802 message = "truncated \\xXX escape";
5803 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804
Benjamin Peterson29060642009-01-31 22:14:21 +00005805 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005807 digits = 4;
5808 message = "truncated \\uXXXX escape";
5809 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810
Benjamin Peterson29060642009-01-31 22:14:21 +00005811 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005812 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005813 digits = 8;
5814 message = "truncated \\UXXXXXXXX escape";
5815 hexescape:
5816 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005817 if (end - s < digits) {
5818 /* count only hex digits */
5819 for (; s < end; ++s) {
5820 c = (unsigned char)*s;
5821 if (!Py_ISXDIGIT(c))
5822 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005823 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005824 goto error;
5825 }
5826 for (; digits--; ++s) {
5827 c = (unsigned char)*s;
5828 if (!Py_ISXDIGIT(c))
5829 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005830 chr = (chr<<4) & ~0xF;
5831 if (c >= '0' && c <= '9')
5832 chr += c - '0';
5833 else if (c >= 'a' && c <= 'f')
5834 chr += 10 + c - 'a';
5835 else
5836 chr += 10 + c - 'A';
5837 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005838 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005839 /* _decoding_error will have already written into the
5840 target buffer. */
5841 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005842 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005843 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005844 message = "illegal Unicode character";
5845 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005846 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005847 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005848 break;
5849
Benjamin Peterson29060642009-01-31 22:14:21 +00005850 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005851 case 'N':
5852 message = "malformed \\N character escape";
5853 if (ucnhash_CAPI == NULL) {
5854 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005855 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5856 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005857 if (ucnhash_CAPI == NULL)
5858 goto ucnhashError;
5859 }
5860 if (*s == '{') {
5861 const char *start = s+1;
5862 /* look for the closing brace */
5863 while (*s != '}' && s < end)
5864 s++;
5865 if (s > start && s < end && *s == '}') {
5866 /* found a name. look it up in the unicode database */
5867 message = "unknown Unicode character name";
5868 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005869 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005870 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005871 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005872 goto store;
5873 }
5874 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005875 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005876
5877 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005878 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005879 message = "\\ at end of string";
5880 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005881 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005882 }
5883 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005884 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005885 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005886 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005887 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005889 continue;
5890
5891 error:
5892 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005893 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005894 errors, &errorHandler,
5895 "unicodeescape", message,
5896 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005897 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005898 goto onError;
5899 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005901#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005902
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005903 Py_XDECREF(errorHandler);
5904 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005905 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005906
Benjamin Peterson29060642009-01-31 22:14:21 +00005907 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005908 PyErr_SetString(
5909 PyExc_UnicodeError,
5910 "\\N escapes not supported (can't load unicodedata module)"
5911 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005912 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005913 Py_XDECREF(errorHandler);
5914 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005915 return NULL;
5916
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005918 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005919 Py_XDECREF(errorHandler);
5920 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921 return NULL;
5922}
5923
5924/* Return a Unicode-Escape string version of the Unicode object.
5925
5926 If quotes is true, the string is enclosed in u"" or u'' quotes as
5927 appropriate.
5928
5929*/
5930
Alexander Belopolsky40018472011-02-26 01:02:56 +00005931PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005932PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005934 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005935 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005937 int kind;
5938 void *data;
5939 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940
Ezio Melottie7f90372012-10-05 03:33:31 +03005941 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005942 escape.
5943
Ezio Melottie7f90372012-10-05 03:33:31 +03005944 For UCS1 strings it's '\xxx', 4 bytes per source character.
5945 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5946 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005947 */
5948
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005949 if (!PyUnicode_Check(unicode)) {
5950 PyErr_BadArgument();
5951 return NULL;
5952 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005953 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005954 return NULL;
5955 len = PyUnicode_GET_LENGTH(unicode);
5956 kind = PyUnicode_KIND(unicode);
5957 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005958 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005959 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5960 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5961 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5962 }
5963
5964 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005965 return PyBytes_FromStringAndSize(NULL, 0);
5966
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005967 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005968 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005969
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005970 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005972 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005973 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 if (repr == NULL)
5975 return NULL;
5976
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005977 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005979 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005980 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005981
Walter Dörwald79e913e2007-05-12 11:08:06 +00005982 /* Escape backslashes */
5983 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 *p++ = '\\';
5985 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005986 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005987 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005988
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005989 /* Map 21-bit characters to '\U00xxxxxx' */
5990 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005991 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005992 *p++ = '\\';
5993 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005994 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5995 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5996 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5997 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5998 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5999 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6000 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6001 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006002 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006003 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006004
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006006 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007 *p++ = '\\';
6008 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006009 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6010 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6011 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6012 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006014
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006015 /* Map special whitespace to '\t', \n', '\r' */
6016 else if (ch == '\t') {
6017 *p++ = '\\';
6018 *p++ = 't';
6019 }
6020 else if (ch == '\n') {
6021 *p++ = '\\';
6022 *p++ = 'n';
6023 }
6024 else if (ch == '\r') {
6025 *p++ = '\\';
6026 *p++ = 'r';
6027 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006028
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006029 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006030 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006032 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006033 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6034 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006035 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006036
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037 /* Copy everything else as-is */
6038 else
6039 *p++ = (char) ch;
6040 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006042 assert(p - PyBytes_AS_STRING(repr) > 0);
6043 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6044 return NULL;
6045 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046}
6047
Alexander Belopolsky40018472011-02-26 01:02:56 +00006048PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006049PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6050 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006052 PyObject *result;
6053 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6054 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006056 result = PyUnicode_AsUnicodeEscapeString(tmp);
6057 Py_DECREF(tmp);
6058 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059}
6060
6061/* --- Raw Unicode Escape Codec ------------------------------------------- */
6062
Alexander Belopolsky40018472011-02-26 01:02:56 +00006063PyObject *
6064PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006065 Py_ssize_t size,
6066 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006068 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006069 Py_ssize_t startinpos;
6070 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006071 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 const char *end;
6073 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006074 PyObject *errorHandler = NULL;
6075 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006076
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006077 if (size == 0)
6078 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006079
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 /* Escaped strings will always be longer than the resulting
6081 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006082 length after conversion to the true value. (But decoding error
6083 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006084 _PyUnicodeWriter_Init(&writer);
6085 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006086
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087 end = s + size;
6088 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006089 unsigned char c;
6090 Py_UCS4 x;
6091 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006092 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 /* Non-escape characters are interpreted as Unicode ordinals */
6095 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006096 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006097 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006098 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006100 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 startinpos = s-starts;
6102
6103 /* \u-escapes are only interpreted iff the number of leading
6104 backslashes if odd */
6105 bs = s;
6106 for (;s < end;) {
6107 if (*s != '\\')
6108 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006109 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006110 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006111 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006112 }
6113 if (((s - bs) & 1) == 0 ||
6114 s >= end ||
6115 (*s != 'u' && *s != 'U')) {
6116 continue;
6117 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006118 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 count = *s=='u' ? 4 : 8;
6120 s++;
6121
6122 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006123 for (x = 0, i = 0; i < count; ++i, ++s) {
6124 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006125 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006126 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006127 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 errors, &errorHandler,
6129 "rawunicodeescape", "truncated \\uXXXX",
6130 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006131 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006132 goto onError;
6133 goto nextByte;
6134 }
6135 x = (x<<4) & ~0xF;
6136 if (c >= '0' && c <= '9')
6137 x += c - '0';
6138 else if (c >= 'a' && c <= 'f')
6139 x += 10 + c - 'a';
6140 else
6141 x += 10 + c - 'A';
6142 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006143 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006144 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006145 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006146 }
6147 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006148 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006149 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006150 errors, &errorHandler,
6151 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006152 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006153 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006154 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006155 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006156 nextByte:
6157 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006159 Py_XDECREF(errorHandler);
6160 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006161 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006162
Benjamin Peterson29060642009-01-31 22:14:21 +00006163 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006164 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006165 Py_XDECREF(errorHandler);
6166 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167 return NULL;
6168}
6169
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006170
Alexander Belopolsky40018472011-02-26 01:02:56 +00006171PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006172PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006174 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 char *p;
6176 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006177 Py_ssize_t expandsize, pos;
6178 int kind;
6179 void *data;
6180 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006182 if (!PyUnicode_Check(unicode)) {
6183 PyErr_BadArgument();
6184 return NULL;
6185 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006186 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006187 return NULL;
6188 kind = PyUnicode_KIND(unicode);
6189 data = PyUnicode_DATA(unicode);
6190 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006191 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6192 bytes, and 1 byte characters 4. */
6193 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006194
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006195 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006196 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006197
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006198 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 if (repr == NULL)
6200 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006201 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006202 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006204 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006205 for (pos = 0; pos < len; pos++) {
6206 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 /* Map 32-bit characters to '\Uxxxxxxxx' */
6208 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006209 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006210 *p++ = '\\';
6211 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006212 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6213 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6214 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6215 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6216 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6217 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6218 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6219 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006220 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006222 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223 *p++ = '\\';
6224 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006225 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6226 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6227 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6228 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006230 /* Copy everything else as-is */
6231 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 *p++ = (char) ch;
6233 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006234
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006235 assert(p > q);
6236 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006237 return NULL;
6238 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239}
6240
Alexander Belopolsky40018472011-02-26 01:02:56 +00006241PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006242PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6243 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006245 PyObject *result;
6246 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6247 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006248 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006249 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6250 Py_DECREF(tmp);
6251 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252}
6253
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006254/* --- Unicode Internal Codec ------------------------------------------- */
6255
Alexander Belopolsky40018472011-02-26 01:02:56 +00006256PyObject *
6257_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006258 Py_ssize_t size,
6259 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006260{
6261 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006262 Py_ssize_t startinpos;
6263 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006264 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006265 const char *end;
6266 const char *reason;
6267 PyObject *errorHandler = NULL;
6268 PyObject *exc = NULL;
6269
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006270 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006271 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006272 1))
6273 return NULL;
6274
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006275 if (size == 0)
6276 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006277
Victor Stinner8f674cc2013-04-17 23:02:17 +02006278 _PyUnicodeWriter_Init(&writer);
6279 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6280 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006281 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006282 }
6283 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006284
Victor Stinner8f674cc2013-04-17 23:02:17 +02006285 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006286 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006287 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006288 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006289 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006290 endinpos = end-starts;
6291 reason = "truncated input";
6292 goto error;
6293 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006294 /* We copy the raw representation one byte at a time because the
6295 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006296 ((char *) &uch)[0] = s[0];
6297 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006298#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006299 ((char *) &uch)[2] = s[2];
6300 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006301#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006302 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006303#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006304 /* We have to sanity check the raw data, otherwise doom looms for
6305 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006306 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006307 endinpos = s - starts + Py_UNICODE_SIZE;
6308 reason = "illegal code point (> 0x10FFFF)";
6309 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006310 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006311#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006312 s += Py_UNICODE_SIZE;
6313#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006314 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006315 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006316 Py_UNICODE uch2;
6317 ((char *) &uch2)[0] = s[0];
6318 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006319 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006320 {
Victor Stinner551ac952011-11-29 22:58:13 +01006321 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006322 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006323 }
6324 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006325#endif
6326
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006327 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006328 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006329 continue;
6330
6331 error:
6332 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006333 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006334 errors, &errorHandler,
6335 "unicode_internal", reason,
6336 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006337 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006338 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006339 }
6340
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006341 Py_XDECREF(errorHandler);
6342 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006343 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006344
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006346 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006347 Py_XDECREF(errorHandler);
6348 Py_XDECREF(exc);
6349 return NULL;
6350}
6351
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352/* --- Latin-1 Codec ------------------------------------------------------ */
6353
Alexander Belopolsky40018472011-02-26 01:02:56 +00006354PyObject *
6355PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006356 Py_ssize_t size,
6357 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006360 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361}
6362
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006363/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006364static void
6365make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006366 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006367 PyObject *unicode,
6368 Py_ssize_t startpos, Py_ssize_t endpos,
6369 const char *reason)
6370{
6371 if (*exceptionObject == NULL) {
6372 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006373 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006374 encoding, unicode, startpos, endpos, reason);
6375 }
6376 else {
6377 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6378 goto onError;
6379 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6380 goto onError;
6381 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6382 goto onError;
6383 return;
6384 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006385 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006386 }
6387}
6388
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006389/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006390static void
6391raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006392 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006393 PyObject *unicode,
6394 Py_ssize_t startpos, Py_ssize_t endpos,
6395 const char *reason)
6396{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006397 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006398 encoding, unicode, startpos, endpos, reason);
6399 if (*exceptionObject != NULL)
6400 PyCodec_StrictErrors(*exceptionObject);
6401}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006402
6403/* error handling callback helper:
6404 build arguments, call the callback and check the arguments,
6405 put the result into newpos and return the replacement string, which
6406 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006407static PyObject *
6408unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006409 PyObject **errorHandler,
6410 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006411 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006412 Py_ssize_t startpos, Py_ssize_t endpos,
6413 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006415 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006416 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006417 PyObject *restuple;
6418 PyObject *resunicode;
6419
6420 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006421 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006422 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006423 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006424 }
6425
Benjamin Petersonbac79492012-01-14 13:34:47 -05006426 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006427 return NULL;
6428 len = PyUnicode_GET_LENGTH(unicode);
6429
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006430 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006431 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006432 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006433 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006434
6435 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006436 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006437 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006438 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006439 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006440 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 Py_DECREF(restuple);
6442 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006443 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006444 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 &resunicode, newpos)) {
6446 Py_DECREF(restuple);
6447 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006448 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006449 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6450 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6451 Py_DECREF(restuple);
6452 return NULL;
6453 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006454 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006455 *newpos = len + *newpos;
6456 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006457 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006458 Py_DECREF(restuple);
6459 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006460 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006461 Py_INCREF(resunicode);
6462 Py_DECREF(restuple);
6463 return resunicode;
6464}
6465
Alexander Belopolsky40018472011-02-26 01:02:56 +00006466static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006467unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006468 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006469 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006470{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006471 /* input state */
6472 Py_ssize_t pos=0, size;
6473 int kind;
6474 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006475 /* output object */
6476 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006477 /* pointer into the output */
6478 char *str;
6479 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006480 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006481 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6482 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483 PyObject *errorHandler = NULL;
6484 PyObject *exc = NULL;
6485 /* the following variable is used for caching string comparisons
6486 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6487 int known_errorHandler = -1;
6488
Benjamin Petersonbac79492012-01-14 13:34:47 -05006489 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006490 return NULL;
6491 size = PyUnicode_GET_LENGTH(unicode);
6492 kind = PyUnicode_KIND(unicode);
6493 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006494 /* allocate enough for a simple encoding without
6495 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006496 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006497 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006498 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006499 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006500 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006501 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006502 ressize = size;
6503
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006504 while (pos < size) {
6505 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006506
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 /* can we encode this? */
6508 if (c<limit) {
6509 /* no overflow check, because we know that the space is enough */
6510 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006511 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006512 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006513 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 Py_ssize_t requiredsize;
6515 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006516 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006518 Py_ssize_t collstart = pos;
6519 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006520 /* find all unecodable characters */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006521 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006522 ++collend;
6523 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6524 if (known_errorHandler==-1) {
6525 if ((errors==NULL) || (!strcmp(errors, "strict")))
6526 known_errorHandler = 1;
6527 else if (!strcmp(errors, "replace"))
6528 known_errorHandler = 2;
6529 else if (!strcmp(errors, "ignore"))
6530 known_errorHandler = 3;
6531 else if (!strcmp(errors, "xmlcharrefreplace"))
6532 known_errorHandler = 4;
6533 else
6534 known_errorHandler = 0;
6535 }
6536 switch (known_errorHandler) {
6537 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006538 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006539 goto onError;
6540 case 2: /* replace */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006541 while (collstart++ < collend)
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 *str++ = '?'; /* fall through */
6543 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006544 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006545 break;
6546 case 4: /* xmlcharrefreplace */
6547 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006548 requiredsize = respos;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006549 /* determine replacement size */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006550 for (i = collstart; i < collend; ++i) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006551 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006552 Py_ssize_t incr;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006553 if (ch < 10)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006554 incr = 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006555 else if (ch < 100)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006556 incr = 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006557 else if (ch < 1000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006558 incr = 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006559 else if (ch < 10000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006560 incr = 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006561 else if (ch < 100000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006562 incr = 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006563 else if (ch < 1000000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006564 incr = 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006565 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006566 assert(ch <= MAX_UNICODE);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006567 incr = 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006568 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006569 if (requiredsize > PY_SSIZE_T_MAX - incr)
6570 goto overflow;
6571 requiredsize += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +00006572 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006573 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6574 goto overflow;
6575 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006576 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006577 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 requiredsize = 2*ressize;
6579 if (_PyBytes_Resize(&res, requiredsize))
6580 goto onError;
6581 str = PyBytes_AS_STRING(res) + respos;
6582 ressize = requiredsize;
6583 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006584 /* generate replacement */
6585 for (i = collstart; i < collend; ++i) {
6586 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006587 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006588 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 break;
6590 default:
6591 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006592 encoding, reason, unicode, &exc,
6593 collstart, collend, &newpos);
6594 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006595 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006596 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006597 if (PyBytes_Check(repunicode)) {
6598 /* Directly copy bytes result to output. */
6599 repsize = PyBytes_Size(repunicode);
6600 if (repsize > 1) {
6601 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006602 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006603 if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
6604 Py_DECREF(repunicode);
6605 goto overflow;
6606 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00006607 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6608 Py_DECREF(repunicode);
6609 goto onError;
6610 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006611 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006612 ressize += repsize-1;
6613 }
6614 memcpy(str, PyBytes_AsString(repunicode), repsize);
6615 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006616 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006617 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006618 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006619 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006620 /* need more space? (at least enough for what we
6621 have+the replacement+the rest of the string, so
6622 we won't have to check space for encodable characters) */
6623 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006624 repsize = PyUnicode_GET_LENGTH(repunicode);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006625 requiredsize = respos;
6626 if (requiredsize > PY_SSIZE_T_MAX - repsize)
6627 goto overflow;
6628 requiredsize += repsize;
6629 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6630 goto overflow;
6631 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006632 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006633 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006634 requiredsize = 2*ressize;
6635 if (_PyBytes_Resize(&res, requiredsize)) {
6636 Py_DECREF(repunicode);
6637 goto onError;
6638 }
6639 str = PyBytes_AS_STRING(res) + respos;
6640 ressize = requiredsize;
6641 }
6642 /* check if there is anything unencodable in the replacement
6643 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006644 for (i = 0; repsize-->0; ++i, ++str) {
6645 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006646 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006647 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006648 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 Py_DECREF(repunicode);
6650 goto onError;
6651 }
6652 *str = (char)c;
6653 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006654 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006655 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006656 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006657 }
6658 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006659 /* Resize if we allocated to much */
6660 size = str - PyBytes_AS_STRING(res);
6661 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006662 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006663 if (_PyBytes_Resize(&res, size) < 0)
6664 goto onError;
6665 }
6666
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006667 Py_XDECREF(errorHandler);
6668 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006669 return res;
6670
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006671 overflow:
6672 PyErr_SetString(PyExc_OverflowError,
6673 "encoded result is too long for a Python string");
6674
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006675 onError:
6676 Py_XDECREF(res);
6677 Py_XDECREF(errorHandler);
6678 Py_XDECREF(exc);
6679 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006680}
6681
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006682/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006683PyObject *
6684PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006685 Py_ssize_t size,
6686 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006688 PyObject *result;
6689 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6690 if (unicode == NULL)
6691 return NULL;
6692 result = unicode_encode_ucs1(unicode, errors, 256);
6693 Py_DECREF(unicode);
6694 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695}
6696
Alexander Belopolsky40018472011-02-26 01:02:56 +00006697PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006698_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699{
6700 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006701 PyErr_BadArgument();
6702 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006704 if (PyUnicode_READY(unicode) == -1)
6705 return NULL;
6706 /* Fast path: if it is a one-byte string, construct
6707 bytes object directly. */
6708 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6709 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6710 PyUnicode_GET_LENGTH(unicode));
6711 /* Non-Latin-1 characters present. Defer to above function to
6712 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006713 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006714}
6715
6716PyObject*
6717PyUnicode_AsLatin1String(PyObject *unicode)
6718{
6719 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720}
6721
6722/* --- 7-bit ASCII Codec -------------------------------------------------- */
6723
Alexander Belopolsky40018472011-02-26 01:02:56 +00006724PyObject *
6725PyUnicode_DecodeASCII(const char *s,
6726 Py_ssize_t size,
6727 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006729 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006730 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006731 int kind;
6732 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006733 Py_ssize_t startinpos;
6734 Py_ssize_t endinpos;
6735 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006736 const char *e;
6737 PyObject *errorHandler = NULL;
6738 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006739
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006741 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006742
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006744 if (size == 1 && (unsigned char)s[0] < 128)
6745 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006746
Victor Stinner8f674cc2013-04-17 23:02:17 +02006747 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006748 writer.min_length = size;
6749 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006750 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006751
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006752 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006753 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006754 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006755 writer.pos = outpos;
6756 if (writer.pos == size)
6757 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006758
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006759 s += writer.pos;
6760 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006761 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006762 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006764 PyUnicode_WRITE(kind, data, writer.pos, c);
6765 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 ++s;
6767 }
6768 else {
6769 startinpos = s-starts;
6770 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006771 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 errors, &errorHandler,
6773 "ascii", "ordinal not in range(128)",
6774 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006775 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006776 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006777 kind = writer.kind;
6778 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006781 Py_XDECREF(errorHandler);
6782 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006783 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006784
Benjamin Peterson29060642009-01-31 22:14:21 +00006785 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006786 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006787 Py_XDECREF(errorHandler);
6788 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789 return NULL;
6790}
6791
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006792/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006793PyObject *
6794PyUnicode_EncodeASCII(const Py_UNICODE *p,
6795 Py_ssize_t size,
6796 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006798 PyObject *result;
6799 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6800 if (unicode == NULL)
6801 return NULL;
6802 result = unicode_encode_ucs1(unicode, errors, 128);
6803 Py_DECREF(unicode);
6804 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805}
6806
Alexander Belopolsky40018472011-02-26 01:02:56 +00006807PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006808_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809{
6810 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 PyErr_BadArgument();
6812 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006814 if (PyUnicode_READY(unicode) == -1)
6815 return NULL;
6816 /* Fast path: if it is an ASCII-only string, construct bytes object
6817 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006818 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006819 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6820 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006821 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006822}
6823
6824PyObject *
6825PyUnicode_AsASCIIString(PyObject *unicode)
6826{
6827 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828}
6829
Victor Stinner99b95382011-07-04 14:23:54 +02006830#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006831
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006832/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006833
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006834#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006835#define NEED_RETRY
6836#endif
6837
Victor Stinner3a50e702011-10-18 21:21:00 +02006838#ifndef WC_ERR_INVALID_CHARS
6839# define WC_ERR_INVALID_CHARS 0x0080
6840#endif
6841
6842static char*
6843code_page_name(UINT code_page, PyObject **obj)
6844{
6845 *obj = NULL;
6846 if (code_page == CP_ACP)
6847 return "mbcs";
6848 if (code_page == CP_UTF7)
6849 return "CP_UTF7";
6850 if (code_page == CP_UTF8)
6851 return "CP_UTF8";
6852
6853 *obj = PyBytes_FromFormat("cp%u", code_page);
6854 if (*obj == NULL)
6855 return NULL;
6856 return PyBytes_AS_STRING(*obj);
6857}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006858
Alexander Belopolsky40018472011-02-26 01:02:56 +00006859static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006860is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006861{
6862 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006863 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006864
Victor Stinner3a50e702011-10-18 21:21:00 +02006865 if (!IsDBCSLeadByteEx(code_page, *curr))
6866 return 0;
6867
6868 prev = CharPrevExA(code_page, s, curr, 0);
6869 if (prev == curr)
6870 return 1;
6871 /* FIXME: This code is limited to "true" double-byte encodings,
6872 as it assumes an incomplete character consists of a single
6873 byte. */
6874 if (curr - prev == 2)
6875 return 1;
6876 if (!IsDBCSLeadByteEx(code_page, *prev))
6877 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006878 return 0;
6879}
6880
Victor Stinner3a50e702011-10-18 21:21:00 +02006881static DWORD
6882decode_code_page_flags(UINT code_page)
6883{
6884 if (code_page == CP_UTF7) {
6885 /* The CP_UTF7 decoder only supports flags=0 */
6886 return 0;
6887 }
6888 else
6889 return MB_ERR_INVALID_CHARS;
6890}
6891
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006892/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006893 * Decode a byte string from a Windows code page into unicode object in strict
6894 * mode.
6895 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006896 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6897 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006898 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006899static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006900decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006901 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006902 const char *in,
6903 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006904{
Victor Stinner3a50e702011-10-18 21:21:00 +02006905 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006906 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006907 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006908
6909 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006910 assert(insize > 0);
6911 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6912 if (outsize <= 0)
6913 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006914
6915 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006916 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006917 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006918 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006919 if (*v == NULL)
6920 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006921 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006922 }
6923 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006924 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006925 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006926 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006927 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006928 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006929 }
6930
6931 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006932 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6933 if (outsize <= 0)
6934 goto error;
6935 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006936
Victor Stinner3a50e702011-10-18 21:21:00 +02006937error:
6938 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6939 return -2;
6940 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006941 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006942}
6943
Victor Stinner3a50e702011-10-18 21:21:00 +02006944/*
6945 * Decode a byte string from a code page into unicode object with an error
6946 * handler.
6947 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006948 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006949 * UnicodeDecodeError exception and returns -1 on error.
6950 */
6951static int
6952decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006953 PyObject **v,
6954 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006955 const char *errors)
6956{
6957 const char *startin = in;
6958 const char *endin = in + size;
6959 const DWORD flags = decode_code_page_flags(code_page);
6960 /* Ideally, we should get reason from FormatMessage. This is the Windows
6961 2000 English version of the message. */
6962 const char *reason = "No mapping for the Unicode character exists "
6963 "in the target code page.";
6964 /* each step cannot decode more than 1 character, but a character can be
6965 represented as a surrogate pair */
6966 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006967 int insize;
6968 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006969 PyObject *errorHandler = NULL;
6970 PyObject *exc = NULL;
6971 PyObject *encoding_obj = NULL;
6972 char *encoding;
6973 DWORD err;
6974 int ret = -1;
6975
6976 assert(size > 0);
6977
6978 encoding = code_page_name(code_page, &encoding_obj);
6979 if (encoding == NULL)
6980 return -1;
6981
6982 if (errors == NULL || strcmp(errors, "strict") == 0) {
6983 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6984 UnicodeDecodeError. */
6985 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6986 if (exc != NULL) {
6987 PyCodec_StrictErrors(exc);
6988 Py_CLEAR(exc);
6989 }
6990 goto error;
6991 }
6992
6993 if (*v == NULL) {
6994 /* Create unicode object */
6995 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6996 PyErr_NoMemory();
6997 goto error;
6998 }
Victor Stinnerab595942011-12-17 04:59:06 +01006999 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007000 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007001 if (*v == NULL)
7002 goto error;
7003 startout = PyUnicode_AS_UNICODE(*v);
7004 }
7005 else {
7006 /* Extend unicode object */
7007 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7008 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7009 PyErr_NoMemory();
7010 goto error;
7011 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007012 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007013 goto error;
7014 startout = PyUnicode_AS_UNICODE(*v) + n;
7015 }
7016
7017 /* Decode the byte string character per character */
7018 out = startout;
7019 while (in < endin)
7020 {
7021 /* Decode a character */
7022 insize = 1;
7023 do
7024 {
7025 outsize = MultiByteToWideChar(code_page, flags,
7026 in, insize,
7027 buffer, Py_ARRAY_LENGTH(buffer));
7028 if (outsize > 0)
7029 break;
7030 err = GetLastError();
7031 if (err != ERROR_NO_UNICODE_TRANSLATION
7032 && err != ERROR_INSUFFICIENT_BUFFER)
7033 {
7034 PyErr_SetFromWindowsErr(0);
7035 goto error;
7036 }
7037 insize++;
7038 }
7039 /* 4=maximum length of a UTF-8 sequence */
7040 while (insize <= 4 && (in + insize) <= endin);
7041
7042 if (outsize <= 0) {
7043 Py_ssize_t startinpos, endinpos, outpos;
7044
7045 startinpos = in - startin;
7046 endinpos = startinpos + 1;
7047 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007048 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007049 errors, &errorHandler,
7050 encoding, reason,
7051 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007052 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007053 {
7054 goto error;
7055 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007056 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007057 }
7058 else {
7059 in += insize;
7060 memcpy(out, buffer, outsize * sizeof(wchar_t));
7061 out += outsize;
7062 }
7063 }
7064
7065 /* write a NUL character at the end */
7066 *out = 0;
7067
7068 /* Extend unicode object */
7069 outsize = out - startout;
7070 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007071 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007072 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007073 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007074
7075error:
7076 Py_XDECREF(encoding_obj);
7077 Py_XDECREF(errorHandler);
7078 Py_XDECREF(exc);
7079 return ret;
7080}
7081
Victor Stinner3a50e702011-10-18 21:21:00 +02007082static PyObject *
7083decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007084 const char *s, Py_ssize_t size,
7085 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007086{
Victor Stinner76a31a62011-11-04 00:05:13 +01007087 PyObject *v = NULL;
7088 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007089
Victor Stinner3a50e702011-10-18 21:21:00 +02007090 if (code_page < 0) {
7091 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7092 return NULL;
7093 }
7094
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007095 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007096 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007097
Victor Stinner76a31a62011-11-04 00:05:13 +01007098 do
7099 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007100#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007101 if (size > INT_MAX) {
7102 chunk_size = INT_MAX;
7103 final = 0;
7104 done = 0;
7105 }
7106 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007107#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007108 {
7109 chunk_size = (int)size;
7110 final = (consumed == NULL);
7111 done = 1;
7112 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007113
Victor Stinner76a31a62011-11-04 00:05:13 +01007114 /* Skip trailing lead-byte unless 'final' is set */
7115 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7116 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007117
Victor Stinner76a31a62011-11-04 00:05:13 +01007118 if (chunk_size == 0 && done) {
7119 if (v != NULL)
7120 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007121 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007122 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007123
Victor Stinner76a31a62011-11-04 00:05:13 +01007124
7125 converted = decode_code_page_strict(code_page, &v,
7126 s, chunk_size);
7127 if (converted == -2)
7128 converted = decode_code_page_errors(code_page, &v,
7129 s, chunk_size,
7130 errors);
7131 assert(converted != 0);
7132
7133 if (converted < 0) {
7134 Py_XDECREF(v);
7135 return NULL;
7136 }
7137
7138 if (consumed)
7139 *consumed += converted;
7140
7141 s += converted;
7142 size -= converted;
7143 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007144
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007145 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007146}
7147
Alexander Belopolsky40018472011-02-26 01:02:56 +00007148PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007149PyUnicode_DecodeCodePageStateful(int code_page,
7150 const char *s,
7151 Py_ssize_t size,
7152 const char *errors,
7153 Py_ssize_t *consumed)
7154{
7155 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7156}
7157
7158PyObject *
7159PyUnicode_DecodeMBCSStateful(const char *s,
7160 Py_ssize_t size,
7161 const char *errors,
7162 Py_ssize_t *consumed)
7163{
7164 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7165}
7166
7167PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007168PyUnicode_DecodeMBCS(const char *s,
7169 Py_ssize_t size,
7170 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007171{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007172 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7173}
7174
Victor Stinner3a50e702011-10-18 21:21:00 +02007175static DWORD
7176encode_code_page_flags(UINT code_page, const char *errors)
7177{
7178 if (code_page == CP_UTF8) {
7179 if (winver.dwMajorVersion >= 6)
7180 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7181 and later */
7182 return WC_ERR_INVALID_CHARS;
7183 else
7184 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7185 return 0;
7186 }
7187 else if (code_page == CP_UTF7) {
7188 /* CP_UTF7 only supports flags=0 */
7189 return 0;
7190 }
7191 else {
7192 if (errors != NULL && strcmp(errors, "replace") == 0)
7193 return 0;
7194 else
7195 return WC_NO_BEST_FIT_CHARS;
7196 }
7197}
7198
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007199/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007200 * Encode a Unicode string to a Windows code page into a byte string in strict
7201 * mode.
7202 *
7203 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007204 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007205 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007206static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007207encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007208 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007209 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007210{
Victor Stinner554f3f02010-06-16 23:33:54 +00007211 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007212 BOOL *pusedDefaultChar = &usedDefaultChar;
7213 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007214 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007215 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007216 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007217 const DWORD flags = encode_code_page_flags(code_page, NULL);
7218 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007219 /* Create a substring so that we can get the UTF-16 representation
7220 of just the slice under consideration. */
7221 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007222
Martin v. Löwis3d325192011-11-04 18:23:06 +01007223 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007224
Victor Stinner3a50e702011-10-18 21:21:00 +02007225 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007226 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007227 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007228 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007229
Victor Stinner2fc507f2011-11-04 20:06:39 +01007230 substring = PyUnicode_Substring(unicode, offset, offset+len);
7231 if (substring == NULL)
7232 return -1;
7233 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7234 if (p == NULL) {
7235 Py_DECREF(substring);
7236 return -1;
7237 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007238 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007239
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007240 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007241 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007242 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007243 NULL, 0,
7244 NULL, pusedDefaultChar);
7245 if (outsize <= 0)
7246 goto error;
7247 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007248 if (pusedDefaultChar && *pusedDefaultChar) {
7249 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007250 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007251 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007252
Victor Stinner3a50e702011-10-18 21:21:00 +02007253 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007254 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007255 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007256 if (*outbytes == NULL) {
7257 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007258 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007259 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007260 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007261 }
7262 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007263 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007264 const Py_ssize_t n = PyBytes_Size(*outbytes);
7265 if (outsize > PY_SSIZE_T_MAX - n) {
7266 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007267 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007268 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007269 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007270 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7271 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007272 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007273 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007274 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007275 }
7276
7277 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007278 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007279 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007280 out, outsize,
7281 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007282 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007283 if (outsize <= 0)
7284 goto error;
7285 if (pusedDefaultChar && *pusedDefaultChar)
7286 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007287 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007288
Victor Stinner3a50e702011-10-18 21:21:00 +02007289error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007290 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007291 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7292 return -2;
7293 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007294 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007295}
7296
Victor Stinner3a50e702011-10-18 21:21:00 +02007297/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007298 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007299 * error handler.
7300 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007301 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007302 * -1 on other error.
7303 */
7304static int
7305encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007306 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007307 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007308{
Victor Stinner3a50e702011-10-18 21:21:00 +02007309 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007310 Py_ssize_t pos = unicode_offset;
7311 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007312 /* Ideally, we should get reason from FormatMessage. This is the Windows
7313 2000 English version of the message. */
7314 const char *reason = "invalid character";
7315 /* 4=maximum length of a UTF-8 sequence */
7316 char buffer[4];
7317 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7318 Py_ssize_t outsize;
7319 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007320 PyObject *errorHandler = NULL;
7321 PyObject *exc = NULL;
7322 PyObject *encoding_obj = NULL;
7323 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007324 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007325 PyObject *rep;
7326 int ret = -1;
7327
7328 assert(insize > 0);
7329
7330 encoding = code_page_name(code_page, &encoding_obj);
7331 if (encoding == NULL)
7332 return -1;
7333
7334 if (errors == NULL || strcmp(errors, "strict") == 0) {
7335 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7336 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007337 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007338 if (exc != NULL) {
7339 PyCodec_StrictErrors(exc);
7340 Py_DECREF(exc);
7341 }
7342 Py_XDECREF(encoding_obj);
7343 return -1;
7344 }
7345
7346 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7347 pusedDefaultChar = &usedDefaultChar;
7348 else
7349 pusedDefaultChar = NULL;
7350
7351 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7352 PyErr_NoMemory();
7353 goto error;
7354 }
7355 outsize = insize * Py_ARRAY_LENGTH(buffer);
7356
7357 if (*outbytes == NULL) {
7358 /* Create string object */
7359 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7360 if (*outbytes == NULL)
7361 goto error;
7362 out = PyBytes_AS_STRING(*outbytes);
7363 }
7364 else {
7365 /* Extend string object */
7366 Py_ssize_t n = PyBytes_Size(*outbytes);
7367 if (n > PY_SSIZE_T_MAX - outsize) {
7368 PyErr_NoMemory();
7369 goto error;
7370 }
7371 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7372 goto error;
7373 out = PyBytes_AS_STRING(*outbytes) + n;
7374 }
7375
7376 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007377 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007378 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007379 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7380 wchar_t chars[2];
7381 int charsize;
7382 if (ch < 0x10000) {
7383 chars[0] = (wchar_t)ch;
7384 charsize = 1;
7385 }
7386 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007387 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7388 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007389 charsize = 2;
7390 }
7391
Victor Stinner3a50e702011-10-18 21:21:00 +02007392 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007393 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007394 buffer, Py_ARRAY_LENGTH(buffer),
7395 NULL, pusedDefaultChar);
7396 if (outsize > 0) {
7397 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7398 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007399 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007400 memcpy(out, buffer, outsize);
7401 out += outsize;
7402 continue;
7403 }
7404 }
7405 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7406 PyErr_SetFromWindowsErr(0);
7407 goto error;
7408 }
7409
Victor Stinner3a50e702011-10-18 21:21:00 +02007410 rep = unicode_encode_call_errorhandler(
7411 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007412 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007413 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007414 if (rep == NULL)
7415 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007416 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007417
7418 if (PyBytes_Check(rep)) {
7419 outsize = PyBytes_GET_SIZE(rep);
7420 if (outsize != 1) {
7421 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7422 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7423 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7424 Py_DECREF(rep);
7425 goto error;
7426 }
7427 out = PyBytes_AS_STRING(*outbytes) + offset;
7428 }
7429 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7430 out += outsize;
7431 }
7432 else {
7433 Py_ssize_t i;
7434 enum PyUnicode_Kind kind;
7435 void *data;
7436
Benjamin Petersonbac79492012-01-14 13:34:47 -05007437 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007438 Py_DECREF(rep);
7439 goto error;
7440 }
7441
7442 outsize = PyUnicode_GET_LENGTH(rep);
7443 if (outsize != 1) {
7444 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7445 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7446 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7447 Py_DECREF(rep);
7448 goto error;
7449 }
7450 out = PyBytes_AS_STRING(*outbytes) + offset;
7451 }
7452 kind = PyUnicode_KIND(rep);
7453 data = PyUnicode_DATA(rep);
7454 for (i=0; i < outsize; i++) {
7455 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7456 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007457 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007458 encoding, unicode,
7459 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007460 "unable to encode error handler result to ASCII");
7461 Py_DECREF(rep);
7462 goto error;
7463 }
7464 *out = (unsigned char)ch;
7465 out++;
7466 }
7467 }
7468 Py_DECREF(rep);
7469 }
7470 /* write a NUL byte */
7471 *out = 0;
7472 outsize = out - PyBytes_AS_STRING(*outbytes);
7473 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7474 if (_PyBytes_Resize(outbytes, outsize) < 0)
7475 goto error;
7476 ret = 0;
7477
7478error:
7479 Py_XDECREF(encoding_obj);
7480 Py_XDECREF(errorHandler);
7481 Py_XDECREF(exc);
7482 return ret;
7483}
7484
Victor Stinner3a50e702011-10-18 21:21:00 +02007485static PyObject *
7486encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007487 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007488 const char *errors)
7489{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007490 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007492 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007493 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007494
Benjamin Petersonbac79492012-01-14 13:34:47 -05007495 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007496 return NULL;
7497 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007498
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 if (code_page < 0) {
7500 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7501 return NULL;
7502 }
7503
Martin v. Löwis3d325192011-11-04 18:23:06 +01007504 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007505 return PyBytes_FromStringAndSize(NULL, 0);
7506
Victor Stinner7581cef2011-11-03 22:32:33 +01007507 offset = 0;
7508 do
7509 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007510#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007511 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007512 chunks. */
7513 if (len > INT_MAX/2) {
7514 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007515 done = 0;
7516 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007517 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007518#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007519 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007520 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007521 done = 1;
7522 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007523
Victor Stinner76a31a62011-11-04 00:05:13 +01007524 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007525 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007526 errors);
7527 if (ret == -2)
7528 ret = encode_code_page_errors(code_page, &outbytes,
7529 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007530 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007531 if (ret < 0) {
7532 Py_XDECREF(outbytes);
7533 return NULL;
7534 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007535
Victor Stinner7581cef2011-11-03 22:32:33 +01007536 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007537 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007538 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007539
Victor Stinner3a50e702011-10-18 21:21:00 +02007540 return outbytes;
7541}
7542
7543PyObject *
7544PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7545 Py_ssize_t size,
7546 const char *errors)
7547{
Victor Stinner7581cef2011-11-03 22:32:33 +01007548 PyObject *unicode, *res;
7549 unicode = PyUnicode_FromUnicode(p, size);
7550 if (unicode == NULL)
7551 return NULL;
7552 res = encode_code_page(CP_ACP, unicode, errors);
7553 Py_DECREF(unicode);
7554 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007555}
7556
7557PyObject *
7558PyUnicode_EncodeCodePage(int code_page,
7559 PyObject *unicode,
7560 const char *errors)
7561{
Victor Stinner7581cef2011-11-03 22:32:33 +01007562 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007563}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007564
Alexander Belopolsky40018472011-02-26 01:02:56 +00007565PyObject *
7566PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007567{
7568 if (!PyUnicode_Check(unicode)) {
7569 PyErr_BadArgument();
7570 return NULL;
7571 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007572 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007573}
7574
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007575#undef NEED_RETRY
7576
Victor Stinner99b95382011-07-04 14:23:54 +02007577#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007578
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579/* --- Character Mapping Codec -------------------------------------------- */
7580
Victor Stinnerfb161b12013-04-18 01:44:27 +02007581static int
7582charmap_decode_string(const char *s,
7583 Py_ssize_t size,
7584 PyObject *mapping,
7585 const char *errors,
7586 _PyUnicodeWriter *writer)
7587{
7588 const char *starts = s;
7589 const char *e;
7590 Py_ssize_t startinpos, endinpos;
7591 PyObject *errorHandler = NULL, *exc = NULL;
7592 Py_ssize_t maplen;
7593 enum PyUnicode_Kind mapkind;
7594 void *mapdata;
7595 Py_UCS4 x;
7596 unsigned char ch;
7597
7598 if (PyUnicode_READY(mapping) == -1)
7599 return -1;
7600
7601 maplen = PyUnicode_GET_LENGTH(mapping);
7602 mapdata = PyUnicode_DATA(mapping);
7603 mapkind = PyUnicode_KIND(mapping);
7604
7605 e = s + size;
7606
7607 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7608 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7609 * is disabled in encoding aliases, latin1 is preferred because
7610 * its implementation is faster. */
7611 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7612 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7613 Py_UCS4 maxchar = writer->maxchar;
7614
7615 assert (writer->kind == PyUnicode_1BYTE_KIND);
7616 while (s < e) {
7617 ch = *s;
7618 x = mapdata_ucs1[ch];
7619 if (x > maxchar) {
7620 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7621 goto onError;
7622 maxchar = writer->maxchar;
7623 outdata = (Py_UCS1 *)writer->data;
7624 }
7625 outdata[writer->pos] = x;
7626 writer->pos++;
7627 ++s;
7628 }
7629 return 0;
7630 }
7631
7632 while (s < e) {
7633 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7634 enum PyUnicode_Kind outkind = writer->kind;
7635 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7636 if (outkind == PyUnicode_1BYTE_KIND) {
7637 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7638 Py_UCS4 maxchar = writer->maxchar;
7639 while (s < e) {
7640 ch = *s;
7641 x = mapdata_ucs2[ch];
7642 if (x > maxchar)
7643 goto Error;
7644 outdata[writer->pos] = x;
7645 writer->pos++;
7646 ++s;
7647 }
7648 break;
7649 }
7650 else if (outkind == PyUnicode_2BYTE_KIND) {
7651 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7652 while (s < e) {
7653 ch = *s;
7654 x = mapdata_ucs2[ch];
7655 if (x == 0xFFFE)
7656 goto Error;
7657 outdata[writer->pos] = x;
7658 writer->pos++;
7659 ++s;
7660 }
7661 break;
7662 }
7663 }
7664 ch = *s;
7665
7666 if (ch < maplen)
7667 x = PyUnicode_READ(mapkind, mapdata, ch);
7668 else
7669 x = 0xfffe; /* invalid value */
7670Error:
7671 if (x == 0xfffe)
7672 {
7673 /* undefined mapping */
7674 startinpos = s-starts;
7675 endinpos = startinpos+1;
7676 if (unicode_decode_call_errorhandler_writer(
7677 errors, &errorHandler,
7678 "charmap", "character maps to <undefined>",
7679 &starts, &e, &startinpos, &endinpos, &exc, &s,
7680 writer)) {
7681 goto onError;
7682 }
7683 continue;
7684 }
7685
7686 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7687 goto onError;
7688 ++s;
7689 }
7690 Py_XDECREF(errorHandler);
7691 Py_XDECREF(exc);
7692 return 0;
7693
7694onError:
7695 Py_XDECREF(errorHandler);
7696 Py_XDECREF(exc);
7697 return -1;
7698}
7699
7700static int
7701charmap_decode_mapping(const char *s,
7702 Py_ssize_t size,
7703 PyObject *mapping,
7704 const char *errors,
7705 _PyUnicodeWriter *writer)
7706{
7707 const char *starts = s;
7708 const char *e;
7709 Py_ssize_t startinpos, endinpos;
7710 PyObject *errorHandler = NULL, *exc = NULL;
7711 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007712 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007713
7714 e = s + size;
7715
7716 while (s < e) {
7717 ch = *s;
7718
7719 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7720 key = PyLong_FromLong((long)ch);
7721 if (key == NULL)
7722 goto onError;
7723
7724 item = PyObject_GetItem(mapping, key);
7725 Py_DECREF(key);
7726 if (item == NULL) {
7727 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7728 /* No mapping found means: mapping is undefined. */
7729 PyErr_Clear();
7730 goto Undefined;
7731 } else
7732 goto onError;
7733 }
7734
7735 /* Apply mapping */
7736 if (item == Py_None)
7737 goto Undefined;
7738 if (PyLong_Check(item)) {
7739 long value = PyLong_AS_LONG(item);
7740 if (value == 0xFFFE)
7741 goto Undefined;
7742 if (value < 0 || value > MAX_UNICODE) {
7743 PyErr_Format(PyExc_TypeError,
7744 "character mapping must be in range(0x%lx)",
7745 (unsigned long)MAX_UNICODE + 1);
7746 goto onError;
7747 }
7748
7749 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7750 goto onError;
7751 }
7752 else if (PyUnicode_Check(item)) {
7753 if (PyUnicode_READY(item) == -1)
7754 goto onError;
7755 if (PyUnicode_GET_LENGTH(item) == 1) {
7756 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7757 if (value == 0xFFFE)
7758 goto Undefined;
7759 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7760 goto onError;
7761 }
7762 else {
7763 writer->overallocate = 1;
7764 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7765 goto onError;
7766 }
7767 }
7768 else {
7769 /* wrong return value */
7770 PyErr_SetString(PyExc_TypeError,
7771 "character mapping must return integer, None or str");
7772 goto onError;
7773 }
7774 Py_CLEAR(item);
7775 ++s;
7776 continue;
7777
7778Undefined:
7779 /* undefined mapping */
7780 Py_CLEAR(item);
7781 startinpos = s-starts;
7782 endinpos = startinpos+1;
7783 if (unicode_decode_call_errorhandler_writer(
7784 errors, &errorHandler,
7785 "charmap", "character maps to <undefined>",
7786 &starts, &e, &startinpos, &endinpos, &exc, &s,
7787 writer)) {
7788 goto onError;
7789 }
7790 }
7791 Py_XDECREF(errorHandler);
7792 Py_XDECREF(exc);
7793 return 0;
7794
7795onError:
7796 Py_XDECREF(item);
7797 Py_XDECREF(errorHandler);
7798 Py_XDECREF(exc);
7799 return -1;
7800}
7801
Alexander Belopolsky40018472011-02-26 01:02:56 +00007802PyObject *
7803PyUnicode_DecodeCharmap(const char *s,
7804 Py_ssize_t size,
7805 PyObject *mapping,
7806 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007808 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007809
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810 /* Default to Latin-1 */
7811 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007815 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007816 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007817 writer.min_length = size;
7818 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007820
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007821 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007822 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7823 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007824 }
7825 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007826 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7827 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007829 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007830
Benjamin Peterson29060642009-01-31 22:14:21 +00007831 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007832 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833 return NULL;
7834}
7835
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007836/* Charmap encoding: the lookup table */
7837
Alexander Belopolsky40018472011-02-26 01:02:56 +00007838struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007839 PyObject_HEAD
7840 unsigned char level1[32];
7841 int count2, count3;
7842 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007843};
7844
7845static PyObject*
7846encoding_map_size(PyObject *obj, PyObject* args)
7847{
7848 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007849 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007850 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007851}
7852
7853static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007854 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007855 PyDoc_STR("Return the size (in bytes) of this object") },
7856 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007857};
7858
7859static void
7860encoding_map_dealloc(PyObject* o)
7861{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007862 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007863}
7864
7865static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007866 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007867 "EncodingMap", /*tp_name*/
7868 sizeof(struct encoding_map), /*tp_basicsize*/
7869 0, /*tp_itemsize*/
7870 /* methods */
7871 encoding_map_dealloc, /*tp_dealloc*/
7872 0, /*tp_print*/
7873 0, /*tp_getattr*/
7874 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007875 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007876 0, /*tp_repr*/
7877 0, /*tp_as_number*/
7878 0, /*tp_as_sequence*/
7879 0, /*tp_as_mapping*/
7880 0, /*tp_hash*/
7881 0, /*tp_call*/
7882 0, /*tp_str*/
7883 0, /*tp_getattro*/
7884 0, /*tp_setattro*/
7885 0, /*tp_as_buffer*/
7886 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7887 0, /*tp_doc*/
7888 0, /*tp_traverse*/
7889 0, /*tp_clear*/
7890 0, /*tp_richcompare*/
7891 0, /*tp_weaklistoffset*/
7892 0, /*tp_iter*/
7893 0, /*tp_iternext*/
7894 encoding_map_methods, /*tp_methods*/
7895 0, /*tp_members*/
7896 0, /*tp_getset*/
7897 0, /*tp_base*/
7898 0, /*tp_dict*/
7899 0, /*tp_descr_get*/
7900 0, /*tp_descr_set*/
7901 0, /*tp_dictoffset*/
7902 0, /*tp_init*/
7903 0, /*tp_alloc*/
7904 0, /*tp_new*/
7905 0, /*tp_free*/
7906 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007907};
7908
7909PyObject*
7910PyUnicode_BuildEncodingMap(PyObject* string)
7911{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007912 PyObject *result;
7913 struct encoding_map *mresult;
7914 int i;
7915 int need_dict = 0;
7916 unsigned char level1[32];
7917 unsigned char level2[512];
7918 unsigned char *mlevel1, *mlevel2, *mlevel3;
7919 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007920 int kind;
7921 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007922 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007923 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007924
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007925 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007926 PyErr_BadArgument();
7927 return NULL;
7928 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007929 kind = PyUnicode_KIND(string);
7930 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007931 length = PyUnicode_GET_LENGTH(string);
7932 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007933 memset(level1, 0xFF, sizeof level1);
7934 memset(level2, 0xFF, sizeof level2);
7935
7936 /* If there isn't a one-to-one mapping of NULL to \0,
7937 or if there are non-BMP characters, we need to use
7938 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007939 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007940 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007941 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007942 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007943 ch = PyUnicode_READ(kind, data, i);
7944 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007945 need_dict = 1;
7946 break;
7947 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007948 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007949 /* unmapped character */
7950 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007951 l1 = ch >> 11;
7952 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007953 if (level1[l1] == 0xFF)
7954 level1[l1] = count2++;
7955 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007956 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007957 }
7958
7959 if (count2 >= 0xFF || count3 >= 0xFF)
7960 need_dict = 1;
7961
7962 if (need_dict) {
7963 PyObject *result = PyDict_New();
7964 PyObject *key, *value;
7965 if (!result)
7966 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007967 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007968 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007969 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007970 if (!key || !value)
7971 goto failed1;
7972 if (PyDict_SetItem(result, key, value) == -1)
7973 goto failed1;
7974 Py_DECREF(key);
7975 Py_DECREF(value);
7976 }
7977 return result;
7978 failed1:
7979 Py_XDECREF(key);
7980 Py_XDECREF(value);
7981 Py_DECREF(result);
7982 return NULL;
7983 }
7984
7985 /* Create a three-level trie */
7986 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7987 16*count2 + 128*count3 - 1);
7988 if (!result)
7989 return PyErr_NoMemory();
7990 PyObject_Init(result, &EncodingMapType);
7991 mresult = (struct encoding_map*)result;
7992 mresult->count2 = count2;
7993 mresult->count3 = count3;
7994 mlevel1 = mresult->level1;
7995 mlevel2 = mresult->level23;
7996 mlevel3 = mresult->level23 + 16*count2;
7997 memcpy(mlevel1, level1, 32);
7998 memset(mlevel2, 0xFF, 16*count2);
7999 memset(mlevel3, 0, 128*count3);
8000 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008001 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008002 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008003 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8004 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008005 /* unmapped character */
8006 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008007 o1 = ch>>11;
8008 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008009 i2 = 16*mlevel1[o1] + o2;
8010 if (mlevel2[i2] == 0xFF)
8011 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008012 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008013 i3 = 128*mlevel2[i2] + o3;
8014 mlevel3[i3] = i;
8015 }
8016 return result;
8017}
8018
8019static int
Victor Stinner22168992011-11-20 17:09:18 +01008020encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008021{
8022 struct encoding_map *map = (struct encoding_map*)mapping;
8023 int l1 = c>>11;
8024 int l2 = (c>>7) & 0xF;
8025 int l3 = c & 0x7F;
8026 int i;
8027
Victor Stinner22168992011-11-20 17:09:18 +01008028 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008029 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008030 if (c == 0)
8031 return 0;
8032 /* level 1*/
8033 i = map->level1[l1];
8034 if (i == 0xFF) {
8035 return -1;
8036 }
8037 /* level 2*/
8038 i = map->level23[16*i+l2];
8039 if (i == 0xFF) {
8040 return -1;
8041 }
8042 /* level 3 */
8043 i = map->level23[16*map->count2 + 128*i + l3];
8044 if (i == 0) {
8045 return -1;
8046 }
8047 return i;
8048}
8049
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008050/* Lookup the character ch in the mapping. If the character
8051 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008052 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008053static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008054charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055{
Christian Heimes217cfd12007-12-02 14:31:20 +00008056 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008057 PyObject *x;
8058
8059 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008061 x = PyObject_GetItem(mapping, w);
8062 Py_DECREF(w);
8063 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008064 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8065 /* No mapping found means: mapping is undefined. */
8066 PyErr_Clear();
8067 x = Py_None;
8068 Py_INCREF(x);
8069 return x;
8070 } else
8071 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008072 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008073 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008074 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008075 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008076 long value = PyLong_AS_LONG(x);
8077 if (value < 0 || value > 255) {
8078 PyErr_SetString(PyExc_TypeError,
8079 "character mapping must be in range(256)");
8080 Py_DECREF(x);
8081 return NULL;
8082 }
8083 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008085 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008087 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008088 /* wrong return value */
8089 PyErr_Format(PyExc_TypeError,
8090 "character mapping must return integer, bytes or None, not %.400s",
8091 x->ob_type->tp_name);
8092 Py_DECREF(x);
8093 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094 }
8095}
8096
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008097static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008098charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008099{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008100 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8101 /* exponentially overallocate to minimize reallocations */
8102 if (requiredsize < 2*outsize)
8103 requiredsize = 2*outsize;
8104 if (_PyBytes_Resize(outobj, requiredsize))
8105 return -1;
8106 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008107}
8108
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or a buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008112/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008113 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008114 space is available. Return a new reference to the object that
8115 was put in the output buffer, or Py_None, if the mapping was undefined
8116 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008117 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008118static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008119charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008120 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008121{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008122 PyObject *rep;
8123 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008124 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008125
Christian Heimes90aa7642007-12-19 02:45:37 +00008126 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008127 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008128 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008129 if (res == -1)
8130 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 if (outsize<requiredsize)
8132 if (charmapencode_resize(outobj, outpos, requiredsize))
8133 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008134 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 outstart[(*outpos)++] = (char)res;
8136 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008137 }
8138
8139 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008141 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008142 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008143 Py_DECREF(rep);
8144 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008145 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 if (PyLong_Check(rep)) {
8147 Py_ssize_t requiredsize = *outpos+1;
8148 if (outsize<requiredsize)
8149 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8150 Py_DECREF(rep);
8151 return enc_EXCEPTION;
8152 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008153 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008154 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008155 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008156 else {
8157 const char *repchars = PyBytes_AS_STRING(rep);
8158 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8159 Py_ssize_t requiredsize = *outpos+repsize;
8160 if (outsize<requiredsize)
8161 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8162 Py_DECREF(rep);
8163 return enc_EXCEPTION;
8164 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008165 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 memcpy(outstart + *outpos, repchars, repsize);
8167 *outpos += repsize;
8168 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008169 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008170 Py_DECREF(rep);
8171 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008172}
8173
8174/* handle an error in PyUnicode_EncodeCharmap
8175 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008176static int
8177charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008178 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008179 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008180 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008181 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008182{
8183 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008184 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008185 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008186 enum PyUnicode_Kind kind;
8187 void *data;
8188 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008189 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008190 Py_ssize_t collstartpos = *inpos;
8191 Py_ssize_t collendpos = *inpos+1;
8192 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008193 char *encoding = "charmap";
8194 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008195 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008196 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008197 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008198
Benjamin Petersonbac79492012-01-14 13:34:47 -05008199 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008200 return -1;
8201 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008202 /* find all unencodable characters */
8203 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008204 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008205 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008206 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008207 val = encoding_map_lookup(ch, mapping);
8208 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008209 break;
8210 ++collendpos;
8211 continue;
8212 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008213
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008214 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8215 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008216 if (rep==NULL)
8217 return -1;
8218 else if (rep!=Py_None) {
8219 Py_DECREF(rep);
8220 break;
8221 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008222 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008223 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008224 }
8225 /* cache callback name lookup
8226 * (if not done yet, i.e. it's the first error) */
8227 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 if ((errors==NULL) || (!strcmp(errors, "strict")))
8229 *known_errorHandler = 1;
8230 else if (!strcmp(errors, "replace"))
8231 *known_errorHandler = 2;
8232 else if (!strcmp(errors, "ignore"))
8233 *known_errorHandler = 3;
8234 else if (!strcmp(errors, "xmlcharrefreplace"))
8235 *known_errorHandler = 4;
8236 else
8237 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008238 }
8239 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008240 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008241 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008242 return -1;
8243 case 2: /* replace */
8244 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008245 x = charmapencode_output('?', mapping, res, respos);
8246 if (x==enc_EXCEPTION) {
8247 return -1;
8248 }
8249 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008250 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 return -1;
8252 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008253 }
8254 /* fall through */
8255 case 3: /* ignore */
8256 *inpos = collendpos;
8257 break;
8258 case 4: /* xmlcharrefreplace */
8259 /* generate replacement (temporarily (mis)uses p) */
8260 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008261 char buffer[2+29+1+1];
8262 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008263 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008264 for (cp = buffer; *cp; ++cp) {
8265 x = charmapencode_output(*cp, mapping, res, respos);
8266 if (x==enc_EXCEPTION)
8267 return -1;
8268 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008269 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 return -1;
8271 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008272 }
8273 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008274 *inpos = collendpos;
8275 break;
8276 default:
8277 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008278 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008280 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008282 if (PyBytes_Check(repunicode)) {
8283 /* Directly copy bytes result to output. */
8284 Py_ssize_t outsize = PyBytes_Size(*res);
8285 Py_ssize_t requiredsize;
8286 repsize = PyBytes_Size(repunicode);
8287 requiredsize = *respos + repsize;
8288 if (requiredsize > outsize)
8289 /* Make room for all additional bytes. */
8290 if (charmapencode_resize(res, respos, requiredsize)) {
8291 Py_DECREF(repunicode);
8292 return -1;
8293 }
8294 memcpy(PyBytes_AsString(*res) + *respos,
8295 PyBytes_AsString(repunicode), repsize);
8296 *respos += repsize;
8297 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008298 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008299 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008300 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008301 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008302 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008303 Py_DECREF(repunicode);
8304 return -1;
8305 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008306 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008307 data = PyUnicode_DATA(repunicode);
8308 kind = PyUnicode_KIND(repunicode);
8309 for (index = 0; index < repsize; index++) {
8310 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8311 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008312 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008313 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008314 return -1;
8315 }
8316 else if (x==enc_FAILED) {
8317 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008318 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 return -1;
8320 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008321 }
8322 *inpos = newpos;
8323 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324 }
8325 return 0;
8326}
8327
Alexander Belopolsky40018472011-02-26 01:02:56 +00008328PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008329_PyUnicode_EncodeCharmap(PyObject *unicode,
8330 PyObject *mapping,
8331 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008333 /* output object */
8334 PyObject *res = NULL;
8335 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008336 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008337 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008338 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008339 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008340 PyObject *errorHandler = NULL;
8341 PyObject *exc = NULL;
8342 /* the following variable is used for caching string comparisons
8343 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8344 * 3=ignore, 4=xmlcharrefreplace */
8345 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008346 void *data;
8347 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348
Benjamin Petersonbac79492012-01-14 13:34:47 -05008349 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008350 return NULL;
8351 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008352 data = PyUnicode_DATA(unicode);
8353 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008354
Guido van Rossumd57fd912000-03-10 22:53:23 +00008355 /* Default to Latin-1 */
8356 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008357 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008358
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008359 /* allocate enough for a simple encoding without
8360 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008361 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008362 if (res == NULL)
8363 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008364 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008368 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008370 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 if (x==enc_EXCEPTION) /* error */
8372 goto onError;
8373 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008374 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 &exc,
8376 &known_errorHandler, &errorHandler, errors,
8377 &res, &respos)) {
8378 goto onError;
8379 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008380 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 else
8382 /* done with this character => adjust input position */
8383 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008384 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008385
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008386 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008387 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008388 if (_PyBytes_Resize(&res, respos) < 0)
8389 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008390
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008391 Py_XDECREF(exc);
8392 Py_XDECREF(errorHandler);
8393 return res;
8394
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008396 Py_XDECREF(res);
8397 Py_XDECREF(exc);
8398 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399 return NULL;
8400}
8401
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008402/* Deprecated */
8403PyObject *
8404PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8405 Py_ssize_t size,
8406 PyObject *mapping,
8407 const char *errors)
8408{
8409 PyObject *result;
8410 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8411 if (unicode == NULL)
8412 return NULL;
8413 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8414 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008415 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008416}
8417
Alexander Belopolsky40018472011-02-26 01:02:56 +00008418PyObject *
8419PyUnicode_AsCharmapString(PyObject *unicode,
8420 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008421{
8422 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 PyErr_BadArgument();
8424 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008426 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008427}
8428
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008429/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008430static void
8431make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008432 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008433 Py_ssize_t startpos, Py_ssize_t endpos,
8434 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008435{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008436 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008437 *exceptionObject = _PyUnicodeTranslateError_Create(
8438 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008439 }
8440 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8442 goto onError;
8443 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8444 goto onError;
8445 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8446 goto onError;
8447 return;
8448 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008449 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450 }
8451}
8452
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008453/* error handling callback helper:
8454 build arguments, call the callback and check the arguments,
8455 put the result into newpos and return the replacement string, which
8456 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008457static PyObject *
8458unicode_translate_call_errorhandler(const char *errors,
8459 PyObject **errorHandler,
8460 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008461 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008462 Py_ssize_t startpos, Py_ssize_t endpos,
8463 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008464{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008465 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008466
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008467 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008468 PyObject *restuple;
8469 PyObject *resunicode;
8470
8471 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008473 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008475 }
8476
8477 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008478 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008479 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008481
8482 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008484 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008486 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008487 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 Py_DECREF(restuple);
8489 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008490 }
8491 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 &resunicode, &i_newpos)) {
8493 Py_DECREF(restuple);
8494 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008495 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008496 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008497 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008498 else
8499 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008500 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008501 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 Py_DECREF(restuple);
8503 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008504 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008505 Py_INCREF(resunicode);
8506 Py_DECREF(restuple);
8507 return resunicode;
8508}
8509
8510/* Lookup the character ch in the mapping and put the result in result,
8511 which must be decrefed by the caller.
8512 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008513static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008514charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008515{
Christian Heimes217cfd12007-12-02 14:31:20 +00008516 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008517 PyObject *x;
8518
8519 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008521 x = PyObject_GetItem(mapping, w);
8522 Py_DECREF(w);
8523 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8525 /* No mapping found means: use 1:1 mapping. */
8526 PyErr_Clear();
8527 *result = NULL;
8528 return 0;
8529 } else
8530 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008531 }
8532 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008533 *result = x;
8534 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008535 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008536 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008537 long value = PyLong_AS_LONG(x);
8538 long max = PyUnicode_GetMax();
8539 if (value < 0 || value > max) {
8540 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008541 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008542 Py_DECREF(x);
8543 return -1;
8544 }
8545 *result = x;
8546 return 0;
8547 }
8548 else if (PyUnicode_Check(x)) {
8549 *result = x;
8550 return 0;
8551 }
8552 else {
8553 /* wrong return value */
8554 PyErr_SetString(PyExc_TypeError,
8555 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008556 Py_DECREF(x);
8557 return -1;
8558 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008559}
8560/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 if not reallocate and adjust various state variables.
8562 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008563static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008564charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008566{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008567 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008568 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008569 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 /* exponentially overallocate to minimize reallocations */
8571 if (requiredsize < 2 * oldsize)
8572 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008573 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8574 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008576 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008577 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008578 }
8579 return 0;
8580}
8581/* lookup the character, put the result in the output string and adjust
8582 various state variables. Return a new reference to the object that
8583 was put in the output buffer in *result, or Py_None, if the mapping was
8584 undefined (in which case no character was written).
8585 The called must decref result.
8586 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008587static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008588charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8589 PyObject *mapping, Py_UCS4 **output,
8590 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008591 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008593 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8594 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008595 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008596 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008598 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008599 }
8600 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008602 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008603 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008604 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008605 }
8606 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008607 Py_ssize_t repsize;
8608 if (PyUnicode_READY(*res) == -1)
8609 return -1;
8610 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008611 if (repsize==1) {
8612 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008613 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008614 }
8615 else if (repsize!=0) {
8616 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008617 Py_ssize_t requiredsize = *opos +
8618 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008620 Py_ssize_t i;
8621 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623 for(i = 0; i < repsize; i++)
8624 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008626 }
8627 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008629 return 0;
8630}
8631
Alexander Belopolsky40018472011-02-26 01:02:56 +00008632PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633_PyUnicode_TranslateCharmap(PyObject *input,
8634 PyObject *mapping,
8635 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637 /* input object */
8638 char *idata;
8639 Py_ssize_t size, i;
8640 int kind;
8641 /* output buffer */
8642 Py_UCS4 *output = NULL;
8643 Py_ssize_t osize;
8644 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008645 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008646 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647 char *reason = "character maps to <undefined>";
8648 PyObject *errorHandler = NULL;
8649 PyObject *exc = NULL;
8650 /* the following variable is used for caching string comparisons
8651 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8652 * 3=ignore, 4=xmlcharrefreplace */
8653 int known_errorHandler = -1;
8654
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 PyErr_BadArgument();
8657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008659
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660 if (PyUnicode_READY(input) == -1)
8661 return NULL;
8662 idata = (char*)PyUnicode_DATA(input);
8663 kind = PyUnicode_KIND(input);
8664 size = PyUnicode_GET_LENGTH(input);
8665 i = 0;
8666
8667 if (size == 0) {
8668 Py_INCREF(input);
8669 return input;
8670 }
8671
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008672 /* allocate enough for a simple 1:1 translation without
8673 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008674 osize = size;
Benjamin Petersone5a853c2015-03-02 13:23:25 -05008675 output = PyMem_NEW(Py_UCS4, osize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676 opos = 0;
8677 if (output == NULL) {
8678 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008680 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008682 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 /* try to encode it */
8684 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008685 if (charmaptranslate_output(input, i, mapping,
8686 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 Py_XDECREF(x);
8688 goto onError;
8689 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008690 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008691 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008692 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008693 else { /* untranslatable character */
8694 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8695 Py_ssize_t repsize;
8696 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008699 Py_ssize_t collstart = i;
8700 Py_ssize_t collend = i+1;
8701 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008702
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008704 while (collend < size) {
8705 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 goto onError;
8707 Py_XDECREF(x);
8708 if (x!=Py_None)
8709 break;
8710 ++collend;
8711 }
8712 /* cache callback name lookup
8713 * (if not done yet, i.e. it's the first error) */
8714 if (known_errorHandler==-1) {
8715 if ((errors==NULL) || (!strcmp(errors, "strict")))
8716 known_errorHandler = 1;
8717 else if (!strcmp(errors, "replace"))
8718 known_errorHandler = 2;
8719 else if (!strcmp(errors, "ignore"))
8720 known_errorHandler = 3;
8721 else if (!strcmp(errors, "xmlcharrefreplace"))
8722 known_errorHandler = 4;
8723 else
8724 known_errorHandler = 0;
8725 }
8726 switch (known_errorHandler) {
8727 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008728 make_translate_exception(&exc,
8729 input, collstart, collend, reason);
8730 if (exc != NULL)
8731 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008732 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 case 2: /* replace */
8734 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008735 for (coll = collstart; coll<collend; coll++)
8736 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008737 /* fall through */
8738 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008739 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 break;
8741 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008742 /* generate replacement (temporarily (mis)uses i) */
8743 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008744 char buffer[2+29+1+1];
8745 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8747 if (charmaptranslate_makespace(&output, &osize,
8748 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008749 goto onError;
8750 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008751 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008752 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008753 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008754 break;
8755 default:
8756 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008757 reason, input, &exc,
8758 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008759 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008760 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008761 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008762 Py_DECREF(repunicode);
8763 goto onError;
8764 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008766 repsize = PyUnicode_GET_LENGTH(repunicode);
8767 if (charmaptranslate_makespace(&output, &osize,
8768 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008769 Py_DECREF(repunicode);
8770 goto onError;
8771 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008772 for (uni2 = 0; repsize-->0; ++uni2)
8773 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8774 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008775 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008776 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008777 }
8778 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008779 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8780 if (!res)
8781 goto onError;
8782 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008783 Py_XDECREF(exc);
8784 Py_XDECREF(errorHandler);
8785 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786
Benjamin Peterson29060642009-01-31 22:14:21 +00008787 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008788 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008789 Py_XDECREF(exc);
8790 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791 return NULL;
8792}
8793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008794/* Deprecated. Use PyUnicode_Translate instead. */
8795PyObject *
8796PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8797 Py_ssize_t size,
8798 PyObject *mapping,
8799 const char *errors)
8800{
Christian Heimes5f520f42012-09-11 14:03:25 +02008801 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008802 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8803 if (!unicode)
8804 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008805 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8806 Py_DECREF(unicode);
8807 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008808}
8809
Alexander Belopolsky40018472011-02-26 01:02:56 +00008810PyObject *
8811PyUnicode_Translate(PyObject *str,
8812 PyObject *mapping,
8813 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008814{
8815 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008816
Guido van Rossumd57fd912000-03-10 22:53:23 +00008817 str = PyUnicode_FromObject(str);
8818 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008819 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008820 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008821 Py_DECREF(str);
8822 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008823}
Tim Petersced69f82003-09-16 20:30:58 +00008824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008825static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008826fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008827{
8828 /* No need to call PyUnicode_READY(self) because this function is only
8829 called as a callback from fixup() which does it already. */
8830 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8831 const int kind = PyUnicode_KIND(self);
8832 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008833 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008834 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008835 Py_ssize_t i;
8836
8837 for (i = 0; i < len; ++i) {
8838 ch = PyUnicode_READ(kind, data, i);
8839 fixed = 0;
8840 if (ch > 127) {
8841 if (Py_UNICODE_ISSPACE(ch))
8842 fixed = ' ';
8843 else {
8844 const int decimal = Py_UNICODE_TODECIMAL(ch);
8845 if (decimal >= 0)
8846 fixed = '0' + decimal;
8847 }
8848 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008849 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008850 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851 PyUnicode_WRITE(kind, data, i, fixed);
8852 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008853 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008854 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008856 }
8857
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008858 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008859}
8860
8861PyObject *
8862_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8863{
8864 if (!PyUnicode_Check(unicode)) {
8865 PyErr_BadInternalCall();
8866 return NULL;
8867 }
8868 if (PyUnicode_READY(unicode) == -1)
8869 return NULL;
8870 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8871 /* If the string is already ASCII, just return the same string */
8872 Py_INCREF(unicode);
8873 return unicode;
8874 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008875 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008876}
8877
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008878PyObject *
8879PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8880 Py_ssize_t length)
8881{
Victor Stinnerf0124502011-11-21 23:12:56 +01008882 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008883 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008884 Py_UCS4 maxchar;
8885 enum PyUnicode_Kind kind;
8886 void *data;
8887
Victor Stinner99d7ad02012-02-22 13:37:39 +01008888 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008889 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008890 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008891 if (ch > 127) {
8892 int decimal = Py_UNICODE_TODECIMAL(ch);
8893 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008894 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008895 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008896 }
8897 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008898
8899 /* Copy to a new string */
8900 decimal = PyUnicode_New(length, maxchar);
8901 if (decimal == NULL)
8902 return decimal;
8903 kind = PyUnicode_KIND(decimal);
8904 data = PyUnicode_DATA(decimal);
8905 /* Iterate over code points */
8906 for (i = 0; i < length; i++) {
8907 Py_UNICODE ch = s[i];
8908 if (ch > 127) {
8909 int decimal = Py_UNICODE_TODECIMAL(ch);
8910 if (decimal >= 0)
8911 ch = '0' + decimal;
8912 }
8913 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008914 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008915 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008916}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008917/* --- Decimal Encoder ---------------------------------------------------- */
8918
Alexander Belopolsky40018472011-02-26 01:02:56 +00008919int
8920PyUnicode_EncodeDecimal(Py_UNICODE *s,
8921 Py_ssize_t length,
8922 char *output,
8923 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008924{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008925 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008926 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008927 enum PyUnicode_Kind kind;
8928 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008929
8930 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008931 PyErr_BadArgument();
8932 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008933 }
8934
Victor Stinner42bf7752011-11-21 22:52:58 +01008935 unicode = PyUnicode_FromUnicode(s, length);
8936 if (unicode == NULL)
8937 return -1;
8938
Benjamin Petersonbac79492012-01-14 13:34:47 -05008939 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008940 Py_DECREF(unicode);
8941 return -1;
8942 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008943 kind = PyUnicode_KIND(unicode);
8944 data = PyUnicode_DATA(unicode);
8945
Victor Stinnerb84d7232011-11-22 01:50:07 +01008946 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008947 PyObject *exc;
8948 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008949 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008950 Py_ssize_t startpos;
8951
8952 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008953
Benjamin Peterson29060642009-01-31 22:14:21 +00008954 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008955 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008956 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008957 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008958 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008959 decimal = Py_UNICODE_TODECIMAL(ch);
8960 if (decimal >= 0) {
8961 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008962 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008963 continue;
8964 }
8965 if (0 < ch && ch < 256) {
8966 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008967 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008968 continue;
8969 }
Victor Stinner6345be92011-11-25 20:09:01 +01008970
Victor Stinner42bf7752011-11-21 22:52:58 +01008971 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008972 exc = NULL;
8973 raise_encode_exception(&exc, "decimal", unicode,
8974 startpos, startpos+1,
8975 "invalid decimal Unicode string");
8976 Py_XDECREF(exc);
8977 Py_DECREF(unicode);
8978 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008979 }
8980 /* 0-terminate the output string */
8981 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008982 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008983 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008984}
8985
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986/* --- Helpers ------------------------------------------------------------ */
8987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008988static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008989any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008990 Py_ssize_t start,
8991 Py_ssize_t end)
8992{
8993 int kind1, kind2, kind;
8994 void *buf1, *buf2;
8995 Py_ssize_t len1, len2, result;
8996
8997 kind1 = PyUnicode_KIND(s1);
8998 kind2 = PyUnicode_KIND(s2);
8999 kind = kind1 > kind2 ? kind1 : kind2;
9000 buf1 = PyUnicode_DATA(s1);
9001 buf2 = PyUnicode_DATA(s2);
9002 if (kind1 != kind)
9003 buf1 = _PyUnicode_AsKind(s1, kind);
9004 if (!buf1)
9005 return -2;
9006 if (kind2 != kind)
9007 buf2 = _PyUnicode_AsKind(s2, kind);
9008 if (!buf2) {
9009 if (kind1 != kind) PyMem_Free(buf1);
9010 return -2;
9011 }
9012 len1 = PyUnicode_GET_LENGTH(s1);
9013 len2 = PyUnicode_GET_LENGTH(s2);
9014
Victor Stinner794d5672011-10-10 03:21:36 +02009015 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009016 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009017 case PyUnicode_1BYTE_KIND:
9018 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9019 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9020 else
9021 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9022 break;
9023 case PyUnicode_2BYTE_KIND:
9024 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9025 break;
9026 case PyUnicode_4BYTE_KIND:
9027 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9028 break;
9029 default:
9030 assert(0); result = -2;
9031 }
9032 }
9033 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009034 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009035 case PyUnicode_1BYTE_KIND:
9036 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9037 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9038 else
9039 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9040 break;
9041 case PyUnicode_2BYTE_KIND:
9042 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9043 break;
9044 case PyUnicode_4BYTE_KIND:
9045 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9046 break;
9047 default:
9048 assert(0); result = -2;
9049 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 }
9051
9052 if (kind1 != kind)
9053 PyMem_Free(buf1);
9054 if (kind2 != kind)
9055 PyMem_Free(buf2);
9056
9057 return result;
9058}
9059
9060Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009061_PyUnicode_InsertThousandsGrouping(
9062 PyObject *unicode, Py_ssize_t index,
9063 Py_ssize_t n_buffer,
9064 void *digits, Py_ssize_t n_digits,
9065 Py_ssize_t min_width,
9066 const char *grouping, PyObject *thousands_sep,
9067 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068{
Victor Stinner41a863c2012-02-24 00:37:51 +01009069 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009070 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009071 Py_ssize_t thousands_sep_len;
9072 Py_ssize_t len;
9073
9074 if (unicode != NULL) {
9075 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009076 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009077 }
9078 else {
9079 kind = PyUnicode_1BYTE_KIND;
9080 data = NULL;
9081 }
9082 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9083 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9084 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9085 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009086 if (thousands_sep_kind < kind) {
9087 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9088 if (!thousands_sep_data)
9089 return -1;
9090 }
9091 else {
9092 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9093 if (!data)
9094 return -1;
9095 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009096 }
9097
Benjamin Petersonead6b532011-12-20 17:23:42 -06009098 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009099 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009100 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009101 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009102 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009103 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009104 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009105 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009106 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009107 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009108 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009109 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009110 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009111 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009112 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009113 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009114 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009115 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009116 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009117 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009118 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009119 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009120 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009121 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009122 break;
9123 default:
9124 assert(0);
9125 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009127 if (unicode != NULL && thousands_sep_kind != kind) {
9128 if (thousands_sep_kind < kind)
9129 PyMem_Free(thousands_sep_data);
9130 else
9131 PyMem_Free(data);
9132 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009133 if (unicode == NULL) {
9134 *maxchar = 127;
9135 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009136 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009137 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009138 }
9139 }
9140 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009141}
9142
9143
/* Helper macro to fix up start/end slice values in place.

   start and end must be modifiable lvalues; len may be any expression
   without side effects (it is evaluated more than once).

   - end larger than len is clamped to len;
   - a negative end or start is offset by len, and clamped to 0 if it is
     still negative afterwards.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (e.g. as the unbraced body of an if/else); every use
   must be terminated with a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009158
Alexander Belopolsky40018472011-02-26 01:02:56 +00009159Py_ssize_t
9160PyUnicode_Count(PyObject *str,
9161 PyObject *substr,
9162 Py_ssize_t start,
9163 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009164{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009165 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009166 PyObject* str_obj;
9167 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009168 int kind1, kind2, kind;
9169 void *buf1 = NULL, *buf2 = NULL;
9170 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009171
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009172 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009173 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009174 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009175 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009176 if (!sub_obj) {
9177 Py_DECREF(str_obj);
9178 return -1;
9179 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009180 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009181 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009182 Py_DECREF(str_obj);
9183 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009184 }
Tim Petersced69f82003-09-16 20:30:58 +00009185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009186 kind1 = PyUnicode_KIND(str_obj);
9187 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009188 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009189 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009190 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009191 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02009192 if (kind2 > kind) {
9193 Py_DECREF(sub_obj);
9194 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009195 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02009196 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01009197 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009198 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009199 if (!buf2)
9200 goto onError;
9201 len1 = PyUnicode_GET_LENGTH(str_obj);
9202 len2 = PyUnicode_GET_LENGTH(sub_obj);
9203
9204 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009205 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009206 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009207 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9208 result = asciilib_count(
9209 ((Py_UCS1*)buf1) + start, end - start,
9210 buf2, len2, PY_SSIZE_T_MAX
9211 );
9212 else
9213 result = ucs1lib_count(
9214 ((Py_UCS1*)buf1) + start, end - start,
9215 buf2, len2, PY_SSIZE_T_MAX
9216 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009217 break;
9218 case PyUnicode_2BYTE_KIND:
9219 result = ucs2lib_count(
9220 ((Py_UCS2*)buf1) + start, end - start,
9221 buf2, len2, PY_SSIZE_T_MAX
9222 );
9223 break;
9224 case PyUnicode_4BYTE_KIND:
9225 result = ucs4lib_count(
9226 ((Py_UCS4*)buf1) + start, end - start,
9227 buf2, len2, PY_SSIZE_T_MAX
9228 );
9229 break;
9230 default:
9231 assert(0); result = 0;
9232 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009233
9234 Py_DECREF(sub_obj);
9235 Py_DECREF(str_obj);
9236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009237 if (kind2 != kind)
9238 PyMem_Free(buf2);
9239
Guido van Rossumd57fd912000-03-10 22:53:23 +00009240 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009241 onError:
9242 Py_DECREF(sub_obj);
9243 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244 if (kind2 != kind && buf2)
9245 PyMem_Free(buf2);
9246 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009247}
9248
Alexander Belopolsky40018472011-02-26 01:02:56 +00009249Py_ssize_t
9250PyUnicode_Find(PyObject *str,
9251 PyObject *sub,
9252 Py_ssize_t start,
9253 Py_ssize_t end,
9254 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009255{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009256 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009257
Guido van Rossumd57fd912000-03-10 22:53:23 +00009258 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009259 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009260 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009261 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009262 if (!sub) {
9263 Py_DECREF(str);
9264 return -2;
9265 }
9266 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9267 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009268 Py_DECREF(str);
9269 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009270 }
Tim Petersced69f82003-09-16 20:30:58 +00009271
Victor Stinner794d5672011-10-10 03:21:36 +02009272 result = any_find_slice(direction,
9273 str, sub, start, end
9274 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009275
Guido van Rossumd57fd912000-03-10 22:53:23 +00009276 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009277 Py_DECREF(sub);
9278
Guido van Rossumd57fd912000-03-10 22:53:23 +00009279 return result;
9280}
9281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009282Py_ssize_t
9283PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9284 Py_ssize_t start, Py_ssize_t end,
9285 int direction)
9286{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009287 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009288 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009289 if (PyUnicode_READY(str) == -1)
9290 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009291 if (start < 0 || end < 0) {
9292 PyErr_SetString(PyExc_IndexError, "string index out of range");
9293 return -2;
9294 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 if (end > PyUnicode_GET_LENGTH(str))
9296 end = PyUnicode_GET_LENGTH(str);
9297 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009298 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9299 kind, end-start, ch, direction);
9300 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009301 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009302 else
9303 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009304}
9305
Alexander Belopolsky40018472011-02-26 01:02:56 +00009306static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009307tailmatch(PyObject *self,
9308 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009309 Py_ssize_t start,
9310 Py_ssize_t end,
9311 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009312{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 int kind_self;
9314 int kind_sub;
9315 void *data_self;
9316 void *data_sub;
9317 Py_ssize_t offset;
9318 Py_ssize_t i;
9319 Py_ssize_t end_sub;
9320
9321 if (PyUnicode_READY(self) == -1 ||
9322 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009323 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009324
9325 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009326 return 1;
9327
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009328 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9329 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009330 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009331 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009333 kind_self = PyUnicode_KIND(self);
9334 data_self = PyUnicode_DATA(self);
9335 kind_sub = PyUnicode_KIND(substring);
9336 data_sub = PyUnicode_DATA(substring);
9337 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9338
9339 if (direction > 0)
9340 offset = end;
9341 else
9342 offset = start;
9343
9344 if (PyUnicode_READ(kind_self, data_self, offset) ==
9345 PyUnicode_READ(kind_sub, data_sub, 0) &&
9346 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9347 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9348 /* If both are of the same kind, memcmp is sufficient */
9349 if (kind_self == kind_sub) {
9350 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009351 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009352 data_sub,
9353 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009354 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009355 }
9356 /* otherwise we have to compare each character by first accesing it */
9357 else {
9358 /* We do not need to compare 0 and len(substring)-1 because
9359 the if statement above ensured already that they are equal
9360 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009361 for (i = 1; i < end_sub; ++i) {
9362 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9363 PyUnicode_READ(kind_sub, data_sub, i))
9364 return 0;
9365 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009366 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009367 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368 }
9369
9370 return 0;
9371}
9372
Alexander Belopolsky40018472011-02-26 01:02:56 +00009373Py_ssize_t
9374PyUnicode_Tailmatch(PyObject *str,
9375 PyObject *substr,
9376 Py_ssize_t start,
9377 Py_ssize_t end,
9378 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009379{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009380 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009381
Guido van Rossumd57fd912000-03-10 22:53:23 +00009382 str = PyUnicode_FromObject(str);
9383 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009384 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009385 substr = PyUnicode_FromObject(substr);
9386 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009387 Py_DECREF(str);
9388 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009389 }
Tim Petersced69f82003-09-16 20:30:58 +00009390
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009391 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009392 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009393 Py_DECREF(str);
9394 Py_DECREF(substr);
9395 return result;
9396}
9397
Guido van Rossumd57fd912000-03-10 22:53:23 +00009398/* Apply fixfct filter to the Unicode object self and return a
9399 reference to the modified object */
9400
Alexander Belopolsky40018472011-02-26 01:02:56 +00009401static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009402fixup(PyObject *self,
9403 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009404{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009405 PyObject *u;
9406 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009407 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009408
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009409 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009410 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009411 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009412 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414 /* fix functions return the new maximum character in a string,
9415 if the kind of the resulting unicode object does not change,
9416 everything is fine. Otherwise we need to change the string kind
9417 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009418 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009419
9420 if (maxchar_new == 0) {
9421 /* no changes */;
9422 if (PyUnicode_CheckExact(self)) {
9423 Py_DECREF(u);
9424 Py_INCREF(self);
9425 return self;
9426 }
9427 else
9428 return u;
9429 }
9430
Victor Stinnere6abb482012-05-02 01:15:40 +02009431 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432
Victor Stinnereaab6042011-12-11 22:22:39 +01009433 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009434 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009435
9436 /* In case the maximum character changed, we need to
9437 convert the string to the new category. */
9438 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9439 if (v == NULL) {
9440 Py_DECREF(u);
9441 return NULL;
9442 }
9443 if (maxchar_new > maxchar_old) {
9444 /* If the maxchar increased so that the kind changed, not all
9445 characters are representable anymore and we need to fix the
9446 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009447 _PyUnicode_FastCopyCharacters(v, 0,
9448 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009449 maxchar_old = fixfct(v);
9450 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451 }
9452 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009453 _PyUnicode_FastCopyCharacters(v, 0,
9454 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009456 Py_DECREF(u);
9457 assert(_PyUnicode_CheckConsistency(v, 1));
9458 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009459}
9460
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009461static PyObject *
9462ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009463{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009464 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9465 char *resdata, *data = PyUnicode_DATA(self);
9466 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009467
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009468 res = PyUnicode_New(len, 127);
9469 if (res == NULL)
9470 return NULL;
9471 resdata = PyUnicode_DATA(res);
9472 if (lower)
9473 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009475 _Py_bytes_upper(resdata, data, len);
9476 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009477}
9478
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009480handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009482 Py_ssize_t j;
9483 int final_sigma;
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02009484 Py_UCS4 c = 0;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009485 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009486
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009487 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9488
9489 where ! is a negation and \p{xxx} is a character with property xxx.
9490 */
9491 for (j = i - 1; j >= 0; j--) {
9492 c = PyUnicode_READ(kind, data, j);
9493 if (!_PyUnicode_IsCaseIgnorable(c))
9494 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009495 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009496 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9497 if (final_sigma) {
9498 for (j = i + 1; j < length; j++) {
9499 c = PyUnicode_READ(kind, data, j);
9500 if (!_PyUnicode_IsCaseIgnorable(c))
9501 break;
9502 }
9503 final_sigma = j == length || !_PyUnicode_IsCased(c);
9504 }
9505 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009506}
9507
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009508static int
9509lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9510 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009512 /* Obscure special case. */
9513 if (c == 0x3A3) {
9514 mapped[0] = handle_capital_sigma(kind, data, length, i);
9515 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009517 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518}
9519
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009520static Py_ssize_t
9521do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009523 Py_ssize_t i, k = 0;
9524 int n_res, j;
9525 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009526
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009527 c = PyUnicode_READ(kind, data, 0);
9528 n_res = _PyUnicode_ToUpperFull(c, mapped);
9529 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009530 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009531 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009532 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009533 for (i = 1; i < length; i++) {
9534 c = PyUnicode_READ(kind, data, i);
9535 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9536 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009537 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009538 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009539 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009540 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009541 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009542}
9543
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009544static Py_ssize_t
9545do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9546 Py_ssize_t i, k = 0;
9547
9548 for (i = 0; i < length; i++) {
9549 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9550 int n_res, j;
9551 if (Py_UNICODE_ISUPPER(c)) {
9552 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9553 }
9554 else if (Py_UNICODE_ISLOWER(c)) {
9555 n_res = _PyUnicode_ToUpperFull(c, mapped);
9556 }
9557 else {
9558 n_res = 1;
9559 mapped[0] = c;
9560 }
9561 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009562 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009563 res[k++] = mapped[j];
9564 }
9565 }
9566 return k;
9567}
9568
9569static Py_ssize_t
9570do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9571 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009572{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009573 Py_ssize_t i, k = 0;
9574
9575 for (i = 0; i < length; i++) {
9576 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9577 int n_res, j;
9578 if (lower)
9579 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9580 else
9581 n_res = _PyUnicode_ToUpperFull(c, mapped);
9582 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009583 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009584 res[k++] = mapped[j];
9585 }
9586 }
9587 return k;
9588}
9589
9590static Py_ssize_t
9591do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9592{
9593 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9594}
9595
9596static Py_ssize_t
9597do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9598{
9599 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9600}
9601
Benjamin Petersone51757f2012-01-12 21:10:29 -05009602static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009603do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9604{
9605 Py_ssize_t i, k = 0;
9606
9607 for (i = 0; i < length; i++) {
9608 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9609 Py_UCS4 mapped[3];
9610 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9611 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009612 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009613 res[k++] = mapped[j];
9614 }
9615 }
9616 return k;
9617}
9618
9619static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009620do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9621{
9622 Py_ssize_t i, k = 0;
9623 int previous_is_cased;
9624
9625 previous_is_cased = 0;
9626 for (i = 0; i < length; i++) {
9627 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9628 Py_UCS4 mapped[3];
9629 int n_res, j;
9630
9631 if (previous_is_cased)
9632 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9633 else
9634 n_res = _PyUnicode_ToTitleFull(c, mapped);
9635
9636 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009637 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009638 res[k++] = mapped[j];
9639 }
9640
9641 previous_is_cased = _PyUnicode_IsCased(c);
9642 }
9643 return k;
9644}
9645
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009646static PyObject *
9647case_operation(PyObject *self,
9648 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9649{
9650 PyObject *res = NULL;
9651 Py_ssize_t length, newlength = 0;
9652 int kind, outkind;
9653 void *data, *outdata;
9654 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9655
Benjamin Petersoneea48462012-01-16 14:28:50 -05009656 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009657
9658 kind = PyUnicode_KIND(self);
9659 data = PyUnicode_DATA(self);
9660 length = PyUnicode_GET_LENGTH(self);
Antoine Pitroub6dc9b72014-10-15 23:14:53 +02009661 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009662 PyErr_SetString(PyExc_OverflowError, "string is too long");
9663 return NULL;
9664 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009665 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009666 if (tmp == NULL)
9667 return PyErr_NoMemory();
9668 newlength = perform(kind, data, length, tmp, &maxchar);
9669 res = PyUnicode_New(newlength, maxchar);
9670 if (res == NULL)
9671 goto leave;
9672 tmpend = tmp + newlength;
9673 outdata = PyUnicode_DATA(res);
9674 outkind = PyUnicode_KIND(res);
9675 switch (outkind) {
9676 case PyUnicode_1BYTE_KIND:
9677 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9678 break;
9679 case PyUnicode_2BYTE_KIND:
9680 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9681 break;
9682 case PyUnicode_4BYTE_KIND:
9683 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9684 break;
9685 default:
9686 assert(0);
9687 break;
9688 }
9689 leave:
9690 PyMem_FREE(tmp);
9691 return res;
9692}
9693
Tim Peters8ce9f162004-08-27 01:49:32 +00009694PyObject *
9695PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009696{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009698 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009700 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009701 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9702 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009703 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009704 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009705 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009706 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009707 int use_memcpy;
9708 unsigned char *res_data = NULL, *sep_data = NULL;
9709 PyObject *last_obj;
9710 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009711
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009712 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009713 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009714 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009715 }
9716
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009717 /* NOTE: the following code can't call back into Python code,
9718 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009719 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009720
Tim Peters05eba1f2004-08-27 21:32:02 +00009721 seqlen = PySequence_Fast_GET_SIZE(fseq);
9722 /* If empty sequence, return u"". */
9723 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009724 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009725 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009726 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009727
Tim Peters05eba1f2004-08-27 21:32:02 +00009728 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009729 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009730 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009731 if (seqlen == 1) {
9732 if (PyUnicode_CheckExact(items[0])) {
9733 res = items[0];
9734 Py_INCREF(res);
9735 Py_DECREF(fseq);
9736 return res;
9737 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009738 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009739 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009740 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009741 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009742 /* Set up sep and seplen */
9743 if (separator == NULL) {
9744 /* fall back to a blank space separator */
9745 sep = PyUnicode_FromOrdinal(' ');
9746 if (!sep)
9747 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009748 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009749 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009750 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009751 else {
9752 if (!PyUnicode_Check(separator)) {
9753 PyErr_Format(PyExc_TypeError,
9754 "separator: expected str instance,"
9755 " %.80s found",
9756 Py_TYPE(separator)->tp_name);
9757 goto onError;
9758 }
9759 if (PyUnicode_READY(separator))
9760 goto onError;
9761 sep = separator;
9762 seplen = PyUnicode_GET_LENGTH(separator);
9763 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9764 /* inc refcount to keep this code path symmetric with the
9765 above case of a blank separator */
9766 Py_INCREF(sep);
9767 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009768 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009769 }
9770
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009771 /* There are at least two things to join, or else we have a subclass
9772 * of str in the sequence.
9773 * Do a pre-pass to figure out the total amount of space we'll
9774 * need (sz), and see whether all argument are strings.
9775 */
9776 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009777#ifdef Py_DEBUG
9778 use_memcpy = 0;
9779#else
9780 use_memcpy = 1;
9781#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009782 for (i = 0; i < seqlen; i++) {
9783 const Py_ssize_t old_sz = sz;
9784 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009785 if (!PyUnicode_Check(item)) {
9786 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009787 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009788 " %.80s found",
9789 i, Py_TYPE(item)->tp_name);
9790 goto onError;
9791 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009792 if (PyUnicode_READY(item) == -1)
9793 goto onError;
9794 sz += PyUnicode_GET_LENGTH(item);
9795 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009796 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009797 if (i != 0)
9798 sz += seplen;
9799 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9800 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009801 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009802 goto onError;
9803 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009804 if (use_memcpy && last_obj != NULL) {
9805 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9806 use_memcpy = 0;
9807 }
9808 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009809 }
Tim Petersced69f82003-09-16 20:30:58 +00009810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009811 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009812 if (res == NULL)
9813 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009814
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009815 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009816#ifdef Py_DEBUG
9817 use_memcpy = 0;
9818#else
9819 if (use_memcpy) {
9820 res_data = PyUnicode_1BYTE_DATA(res);
9821 kind = PyUnicode_KIND(res);
9822 if (seplen != 0)
9823 sep_data = PyUnicode_1BYTE_DATA(sep);
9824 }
9825#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009826 if (use_memcpy) {
9827 for (i = 0; i < seqlen; ++i) {
9828 Py_ssize_t itemlen;
9829 item = items[i];
9830
9831 /* Copy item, and maybe the separator. */
9832 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009833 Py_MEMCPY(res_data,
9834 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009835 kind * seplen);
9836 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009837 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009838
9839 itemlen = PyUnicode_GET_LENGTH(item);
9840 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009841 Py_MEMCPY(res_data,
9842 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009843 kind * itemlen);
9844 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009845 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009846 }
9847 assert(res_data == PyUnicode_1BYTE_DATA(res)
9848 + kind * PyUnicode_GET_LENGTH(res));
9849 }
9850 else {
9851 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9852 Py_ssize_t itemlen;
9853 item = items[i];
9854
9855 /* Copy item, and maybe the separator. */
9856 if (i && seplen != 0) {
9857 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9858 res_offset += seplen;
9859 }
9860
9861 itemlen = PyUnicode_GET_LENGTH(item);
9862 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009863 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009864 res_offset += itemlen;
9865 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009866 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009867 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009868 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009869
Tim Peters05eba1f2004-08-27 21:32:02 +00009870 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009871 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009872 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009873 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009874
Benjamin Peterson29060642009-01-31 22:14:21 +00009875 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009876 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009877 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009878 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879 return NULL;
9880}
9881
/* Store `value` into `length` consecutive elements of the buffer `data`,
   beginning at element index `start`.  `kind` selects the element width
   used to interpret `data` (1, 2 or 4 bytes); PyUnicode_WCHAR_KIND is
   rejected by an assertion.

   Every argument is parenthesized in the expansion so that compound
   expressions can be passed safely.  Arguments may be evaluated more than
   once, so they must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9905
Victor Stinnerd3f08822012-05-29 12:57:52 +02009906void
9907_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9908 Py_UCS4 fill_char)
9909{
9910 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9911 const void *data = PyUnicode_DATA(unicode);
9912 assert(PyUnicode_IS_READY(unicode));
9913 assert(unicode_modifiable(unicode));
9914 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9915 assert(start >= 0);
9916 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9917 FILL(kind, data, fill_char, start, length);
9918}
9919
Victor Stinner3fe55312012-01-04 00:33:50 +01009920Py_ssize_t
9921PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9922 Py_UCS4 fill_char)
9923{
9924 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009925
9926 if (!PyUnicode_Check(unicode)) {
9927 PyErr_BadInternalCall();
9928 return -1;
9929 }
9930 if (PyUnicode_READY(unicode) == -1)
9931 return -1;
9932 if (unicode_check_modifiable(unicode))
9933 return -1;
9934
Victor Stinnerd3f08822012-05-29 12:57:52 +02009935 if (start < 0) {
9936 PyErr_SetString(PyExc_IndexError, "string index out of range");
9937 return -1;
9938 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009939 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9940 PyErr_SetString(PyExc_ValueError,
9941 "fill character is bigger than "
9942 "the string maximum character");
9943 return -1;
9944 }
9945
9946 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9947 length = Py_MIN(maxlen, length);
9948 if (length <= 0)
9949 return 0;
9950
Victor Stinnerd3f08822012-05-29 12:57:52 +02009951 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009952 return length;
9953}
9954
Victor Stinner9310abb2011-10-05 00:59:23 +02009955static PyObject *
9956pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009957 Py_ssize_t left,
9958 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009960{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009961 PyObject *u;
9962 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009963 int kind;
9964 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009965
9966 if (left < 0)
9967 left = 0;
9968 if (right < 0)
9969 right = 0;
9970
Victor Stinnerc4b49542011-12-11 22:44:26 +01009971 if (left == 0 && right == 0)
9972 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009974 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9975 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009976 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9977 return NULL;
9978 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009980 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009981 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009982 if (!u)
9983 return NULL;
9984
9985 kind = PyUnicode_KIND(u);
9986 data = PyUnicode_DATA(u);
9987 if (left)
9988 FILL(kind, data, fill, 0, left);
9989 if (right)
9990 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009991 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009992 assert(_PyUnicode_CheckConsistency(u, 1));
9993 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009994}
9995
Alexander Belopolsky40018472011-02-26 01:02:56 +00009996PyObject *
9997PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009998{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009999 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010000
10001 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010002 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010003 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060010004 if (PyUnicode_READY(string) == -1) {
10005 Py_DECREF(string);
10006 return NULL;
10007 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010008
Benjamin Petersonead6b532011-12-20 17:23:42 -060010009 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010010 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010011 if (PyUnicode_IS_ASCII(string))
10012 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010013 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010014 PyUnicode_GET_LENGTH(string), keepends);
10015 else
10016 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010017 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010018 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 break;
10020 case PyUnicode_2BYTE_KIND:
10021 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010022 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023 PyUnicode_GET_LENGTH(string), keepends);
10024 break;
10025 case PyUnicode_4BYTE_KIND:
10026 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010027 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010028 PyUnicode_GET_LENGTH(string), keepends);
10029 break;
10030 default:
10031 assert(0);
10032 list = 0;
10033 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010034 Py_DECREF(string);
10035 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010036}
10037
Alexander Belopolsky40018472011-02-26 01:02:56 +000010038static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010039split(PyObject *self,
10040 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010041 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010042{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010043 int kind1, kind2, kind;
10044 void *buf1, *buf2;
10045 Py_ssize_t len1, len2;
10046 PyObject* out;
10047
Guido van Rossumd57fd912000-03-10 22:53:23 +000010048 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010049 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010050
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010051 if (PyUnicode_READY(self) == -1)
10052 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010053
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010054 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010055 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010057 if (PyUnicode_IS_ASCII(self))
10058 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010059 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010060 PyUnicode_GET_LENGTH(self), maxcount
10061 );
10062 else
10063 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010064 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010065 PyUnicode_GET_LENGTH(self), maxcount
10066 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067 case PyUnicode_2BYTE_KIND:
10068 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010069 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 PyUnicode_GET_LENGTH(self), maxcount
10071 );
10072 case PyUnicode_4BYTE_KIND:
10073 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010074 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 PyUnicode_GET_LENGTH(self), maxcount
10076 );
10077 default:
10078 assert(0);
10079 return NULL;
10080 }
10081
10082 if (PyUnicode_READY(substring) == -1)
10083 return NULL;
10084
10085 kind1 = PyUnicode_KIND(self);
10086 kind2 = PyUnicode_KIND(substring);
10087 kind = kind1 > kind2 ? kind1 : kind2;
10088 buf1 = PyUnicode_DATA(self);
10089 buf2 = PyUnicode_DATA(substring);
10090 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010091 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010092 if (!buf1)
10093 return NULL;
10094 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010095 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 if (!buf2) {
10097 if (kind1 != kind) PyMem_Free(buf1);
10098 return NULL;
10099 }
10100 len1 = PyUnicode_GET_LENGTH(self);
10101 len2 = PyUnicode_GET_LENGTH(substring);
10102
Benjamin Petersonead6b532011-12-20 17:23:42 -060010103 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010104 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010105 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10106 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010107 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010108 else
10109 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010110 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 break;
10112 case PyUnicode_2BYTE_KIND:
10113 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010114 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 break;
10116 case PyUnicode_4BYTE_KIND:
10117 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010118 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 break;
10120 default:
10121 out = NULL;
10122 }
10123 if (kind1 != kind)
10124 PyMem_Free(buf1);
10125 if (kind2 != kind)
10126 PyMem_Free(buf2);
10127 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010128}
10129
Alexander Belopolsky40018472011-02-26 01:02:56 +000010130static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010131rsplit(PyObject *self,
10132 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010133 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010134{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 int kind1, kind2, kind;
10136 void *buf1, *buf2;
10137 Py_ssize_t len1, len2;
10138 PyObject* out;
10139
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010140 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010141 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 if (PyUnicode_READY(self) == -1)
10144 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010146 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010147 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010149 if (PyUnicode_IS_ASCII(self))
10150 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010151 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010152 PyUnicode_GET_LENGTH(self), maxcount
10153 );
10154 else
10155 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010156 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010157 PyUnicode_GET_LENGTH(self), maxcount
10158 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010159 case PyUnicode_2BYTE_KIND:
10160 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010161 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 PyUnicode_GET_LENGTH(self), maxcount
10163 );
10164 case PyUnicode_4BYTE_KIND:
10165 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010166 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 PyUnicode_GET_LENGTH(self), maxcount
10168 );
10169 default:
10170 assert(0);
10171 return NULL;
10172 }
10173
10174 if (PyUnicode_READY(substring) == -1)
10175 return NULL;
10176
10177 kind1 = PyUnicode_KIND(self);
10178 kind2 = PyUnicode_KIND(substring);
10179 kind = kind1 > kind2 ? kind1 : kind2;
10180 buf1 = PyUnicode_DATA(self);
10181 buf2 = PyUnicode_DATA(substring);
10182 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010183 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 if (!buf1)
10185 return NULL;
10186 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010187 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 if (!buf2) {
10189 if (kind1 != kind) PyMem_Free(buf1);
10190 return NULL;
10191 }
10192 len1 = PyUnicode_GET_LENGTH(self);
10193 len2 = PyUnicode_GET_LENGTH(substring);
10194
Benjamin Petersonead6b532011-12-20 17:23:42 -060010195 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010197 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10198 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010199 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010200 else
10201 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010202 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 break;
10204 case PyUnicode_2BYTE_KIND:
10205 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010206 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 break;
10208 case PyUnicode_4BYTE_KIND:
10209 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010210 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010211 break;
10212 default:
10213 out = NULL;
10214 }
10215 if (kind1 != kind)
10216 PyMem_Free(buf1);
10217 if (kind2 != kind)
10218 PyMem_Free(buf2);
10219 return out;
10220}
10221
10222static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010223anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10224 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010226 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010228 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10229 return asciilib_find(buf1, len1, buf2, len2, offset);
10230 else
10231 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010232 case PyUnicode_2BYTE_KIND:
10233 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10234 case PyUnicode_4BYTE_KIND:
10235 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10236 }
10237 assert(0);
10238 return -1;
10239}
10240
10241static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010242anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10243 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010245 switch (kind) {
10246 case PyUnicode_1BYTE_KIND:
10247 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10248 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10249 else
10250 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10251 case PyUnicode_2BYTE_KIND:
10252 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10253 case PyUnicode_4BYTE_KIND:
10254 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10255 }
10256 assert(0);
10257 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010258}
10259
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010260static void
10261replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10262 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10263{
10264 int kind = PyUnicode_KIND(u);
10265 void *data = PyUnicode_DATA(u);
10266 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10267 if (kind == PyUnicode_1BYTE_KIND) {
10268 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10269 (Py_UCS1 *)data + len,
10270 u1, u2, maxcount);
10271 }
10272 else if (kind == PyUnicode_2BYTE_KIND) {
10273 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10274 (Py_UCS2 *)data + len,
10275 u1, u2, maxcount);
10276 }
10277 else {
10278 assert(kind == PyUnicode_4BYTE_KIND);
10279 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10280 (Py_UCS4 *)data + len,
10281 u1, u2, maxcount);
10282 }
10283}
10284
Alexander Belopolsky40018472011-02-26 01:02:56 +000010285static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286replace(PyObject *self, PyObject *str1,
10287 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010288{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 PyObject *u;
10290 char *sbuf = PyUnicode_DATA(self);
10291 char *buf1 = PyUnicode_DATA(str1);
10292 char *buf2 = PyUnicode_DATA(str2);
10293 int srelease = 0, release1 = 0, release2 = 0;
10294 int skind = PyUnicode_KIND(self);
10295 int kind1 = PyUnicode_KIND(str1);
10296 int kind2 = PyUnicode_KIND(str2);
10297 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10298 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10299 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010300 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010301 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010302
10303 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010304 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010306 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010307
Victor Stinner59de0ee2011-10-07 10:01:28 +020010308 if (str1 == str2)
10309 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310
Victor Stinner49a0a212011-10-12 23:46:10 +020010311 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010312 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10313 if (maxchar < maxchar_str1)
10314 /* substring too wide to be present */
10315 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010316 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10317 /* Replacing str1 with str2 may cause a maxchar reduction in the
10318 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010319 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010320 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010323 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010325 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010327 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010328 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010329 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010330
Victor Stinner69ed0f42013-04-09 21:48:24 +020010331 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010332 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010333 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010334 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010335 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010337 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010339
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010340 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10341 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010342 }
10343 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 int rkind = skind;
10345 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010346 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010347
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 if (kind1 < rkind) {
10349 /* widen substring */
10350 buf1 = _PyUnicode_AsKind(str1, rkind);
10351 if (!buf1) goto error;
10352 release1 = 1;
10353 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010354 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010355 if (i < 0)
10356 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 if (rkind > kind2) {
10358 /* widen replacement */
10359 buf2 = _PyUnicode_AsKind(str2, rkind);
10360 if (!buf2) goto error;
10361 release2 = 1;
10362 }
10363 else if (rkind < kind2) {
10364 /* widen self and buf1 */
10365 rkind = kind2;
10366 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010367 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 sbuf = _PyUnicode_AsKind(self, rkind);
10369 if (!sbuf) goto error;
10370 srelease = 1;
10371 buf1 = _PyUnicode_AsKind(str1, rkind);
10372 if (!buf1) goto error;
10373 release1 = 1;
10374 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010375 u = PyUnicode_New(slen, maxchar);
10376 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010378 assert(PyUnicode_KIND(u) == rkind);
10379 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010380
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010381 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010382 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010383 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010385 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010386 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010387
10388 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010389 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010390 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010391 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010392 if (i == -1)
10393 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010394 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010396 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010397 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010398 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010400 }
10401 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010403 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 int rkind = skind;
10405 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010408 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409 buf1 = _PyUnicode_AsKind(str1, rkind);
10410 if (!buf1) goto error;
10411 release1 = 1;
10412 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010413 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010414 if (n == 0)
10415 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010417 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 buf2 = _PyUnicode_AsKind(str2, rkind);
10419 if (!buf2) goto error;
10420 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010421 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010422 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010423 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 rkind = kind2;
10425 sbuf = _PyUnicode_AsKind(self, rkind);
10426 if (!sbuf) goto error;
10427 srelease = 1;
10428 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010429 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 buf1 = _PyUnicode_AsKind(str1, rkind);
10431 if (!buf1) goto error;
10432 release1 = 1;
10433 }
10434 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10435 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010436 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 PyErr_SetString(PyExc_OverflowError,
10438 "replace string is too long");
10439 goto error;
10440 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010441 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010442 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010443 _Py_INCREF_UNICODE_EMPTY();
10444 if (!unicode_empty)
10445 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010446 u = unicode_empty;
10447 goto done;
10448 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010449 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 PyErr_SetString(PyExc_OverflowError,
10451 "replace string is too long");
10452 goto error;
10453 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010454 u = PyUnicode_New(new_size, maxchar);
10455 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010457 assert(PyUnicode_KIND(u) == rkind);
10458 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459 ires = i = 0;
10460 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010461 while (n-- > 0) {
10462 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010463 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010464 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010465 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010466 if (j == -1)
10467 break;
10468 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010469 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010470 memcpy(res + rkind * ires,
10471 sbuf + rkind * i,
10472 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010474 }
10475 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010477 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010479 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010481 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010483 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010485 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010486 memcpy(res + rkind * ires,
10487 sbuf + rkind * i,
10488 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010489 }
10490 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010491 /* interleave */
10492 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010493 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010495 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010497 if (--n <= 0)
10498 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010499 memcpy(res + rkind * ires,
10500 sbuf + rkind * i,
10501 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502 ires++;
10503 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010504 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010505 memcpy(res + rkind * ires,
10506 sbuf + rkind * i,
10507 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010508 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010509 }
10510
10511 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010512 unicode_adjust_maxchar(&u);
10513 if (u == NULL)
10514 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010515 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010516
10517 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 if (srelease)
10519 PyMem_FREE(sbuf);
10520 if (release1)
10521 PyMem_FREE(buf1);
10522 if (release2)
10523 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010524 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010526
Benjamin Peterson29060642009-01-31 22:14:21 +000010527 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010528 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010529 if (srelease)
10530 PyMem_FREE(sbuf);
10531 if (release1)
10532 PyMem_FREE(buf1);
10533 if (release2)
10534 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010535 return unicode_result_unchanged(self);
10536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 error:
10538 if (srelease && sbuf)
10539 PyMem_FREE(sbuf);
10540 if (release1 && buf1)
10541 PyMem_FREE(buf1);
10542 if (release2 && buf2)
10543 PyMem_FREE(buf2);
10544 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010545}
10546
10547/* --- Unicode Object Methods --------------------------------------------- */
10548
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010549PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010550 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010551\n\
10552Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010553characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010554
10555static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010556unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010557{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010558 if (PyUnicode_READY(self) == -1)
10559 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010560 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010561}
10562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010563PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010564 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010565\n\
10566Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010567have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010568
10569static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010570unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010571{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010572 if (PyUnicode_READY(self) == -1)
10573 return NULL;
10574 if (PyUnicode_GET_LENGTH(self) == 0)
10575 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010576 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577}
10578
Benjamin Petersond5890c82012-01-14 13:23:30 -050010579PyDoc_STRVAR(casefold__doc__,
10580 "S.casefold() -> str\n\
10581\n\
10582Return a version of S suitable for caseless comparisons.");
10583
10584static PyObject *
10585unicode_casefold(PyObject *self)
10586{
10587 if (PyUnicode_READY(self) == -1)
10588 return NULL;
10589 if (PyUnicode_IS_ASCII(self))
10590 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010591 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010592}
10593
10594
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010595/* Argument converter. Coerces to a single unicode character */
10596
10597static int
10598convert_uc(PyObject *obj, void *addr)
10599{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010601 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010602
Benjamin Peterson14339b62009-01-31 16:36:08 +000010603 uniobj = PyUnicode_FromObject(obj);
10604 if (uniobj == NULL) {
10605 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010606 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010607 return 0;
10608 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010610 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010611 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010612 Py_DECREF(uniobj);
10613 return 0;
10614 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010616 Py_DECREF(uniobj);
10617 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010618}
10619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010620PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010621 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010622\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010623Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010624done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010625
10626static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010627unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010628{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010629 Py_ssize_t marg, left;
10630 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 Py_UCS4 fillchar = ' ';
10632
Victor Stinnere9a29352011-10-01 02:14:59 +020010633 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010635
Benjamin Petersonbac79492012-01-14 13:34:47 -050010636 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010637 return NULL;
10638
Victor Stinnerc4b49542011-12-11 22:44:26 +010010639 if (PyUnicode_GET_LENGTH(self) >= width)
10640 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010641
Victor Stinnerc4b49542011-12-11 22:44:26 +010010642 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010643 left = marg / 2 + (marg & width & 1);
10644
Victor Stinner9310abb2011-10-05 00:59:23 +020010645 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010646}
10647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648/* This function assumes that str1 and str2 are readied by the caller. */
10649
Marc-André Lemburge5034372000-08-08 08:04:29 +000010650static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010651unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010652{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010653#define COMPARE(TYPE1, TYPE2) \
10654 do { \
10655 TYPE1* p1 = (TYPE1 *)data1; \
10656 TYPE2* p2 = (TYPE2 *)data2; \
10657 TYPE1* end = p1 + len; \
10658 Py_UCS4 c1, c2; \
10659 for (; p1 != end; p1++, p2++) { \
10660 c1 = *p1; \
10661 c2 = *p2; \
10662 if (c1 != c2) \
10663 return (c1 < c2) ? -1 : 1; \
10664 } \
10665 } \
10666 while (0)
10667
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 int kind1, kind2;
10669 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010670 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010671
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 kind1 = PyUnicode_KIND(str1);
10673 kind2 = PyUnicode_KIND(str2);
10674 data1 = PyUnicode_DATA(str1);
10675 data2 = PyUnicode_DATA(str2);
10676 len1 = PyUnicode_GET_LENGTH(str1);
10677 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010678 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010679
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010680 switch(kind1) {
10681 case PyUnicode_1BYTE_KIND:
10682 {
10683 switch(kind2) {
10684 case PyUnicode_1BYTE_KIND:
10685 {
10686 int cmp = memcmp(data1, data2, len);
10687 /* normalize result of memcmp() into the range [-1; 1] */
10688 if (cmp < 0)
10689 return -1;
10690 if (cmp > 0)
10691 return 1;
10692 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010693 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010694 case PyUnicode_2BYTE_KIND:
10695 COMPARE(Py_UCS1, Py_UCS2);
10696 break;
10697 case PyUnicode_4BYTE_KIND:
10698 COMPARE(Py_UCS1, Py_UCS4);
10699 break;
10700 default:
10701 assert(0);
10702 }
10703 break;
10704 }
10705 case PyUnicode_2BYTE_KIND:
10706 {
10707 switch(kind2) {
10708 case PyUnicode_1BYTE_KIND:
10709 COMPARE(Py_UCS2, Py_UCS1);
10710 break;
10711 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010712 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010713 COMPARE(Py_UCS2, Py_UCS2);
10714 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010715 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010716 case PyUnicode_4BYTE_KIND:
10717 COMPARE(Py_UCS2, Py_UCS4);
10718 break;
10719 default:
10720 assert(0);
10721 }
10722 break;
10723 }
10724 case PyUnicode_4BYTE_KIND:
10725 {
10726 switch(kind2) {
10727 case PyUnicode_1BYTE_KIND:
10728 COMPARE(Py_UCS4, Py_UCS1);
10729 break;
10730 case PyUnicode_2BYTE_KIND:
10731 COMPARE(Py_UCS4, Py_UCS2);
10732 break;
10733 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010734 {
10735#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10736 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10737 /* normalize result of wmemcmp() into the range [-1; 1] */
10738 if (cmp < 0)
10739 return -1;
10740 if (cmp > 0)
10741 return 1;
10742#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010743 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010744#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010745 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010746 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010747 default:
10748 assert(0);
10749 }
10750 break;
10751 }
10752 default:
10753 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010754 }
10755
Victor Stinner770e19e2012-10-04 22:59:45 +020010756 if (len1 == len2)
10757 return 0;
10758 if (len1 < len2)
10759 return -1;
10760 else
10761 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010762
10763#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010764}
10765
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010766Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010767unicode_compare_eq(PyObject *str1, PyObject *str2)
10768{
10769 int kind;
10770 void *data1, *data2;
10771 Py_ssize_t len;
10772 int cmp;
10773
Victor Stinnere5567ad2012-10-23 02:48:49 +020010774 len = PyUnicode_GET_LENGTH(str1);
10775 if (PyUnicode_GET_LENGTH(str2) != len)
10776 return 0;
10777 kind = PyUnicode_KIND(str1);
10778 if (PyUnicode_KIND(str2) != kind)
10779 return 0;
10780 data1 = PyUnicode_DATA(str1);
10781 data2 = PyUnicode_DATA(str2);
10782
10783 cmp = memcmp(data1, data2, len * kind);
10784 return (cmp == 0);
10785}
10786
10787
Alexander Belopolsky40018472011-02-26 01:02:56 +000010788int
10789PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010790{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10792 if (PyUnicode_READY(left) == -1 ||
10793 PyUnicode_READY(right) == -1)
10794 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010795
10796 /* a string is equal to itself */
10797 if (left == right)
10798 return 0;
10799
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010800 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010802 PyErr_Format(PyExc_TypeError,
10803 "Can't compare %.100s and %.100s",
10804 left->ob_type->tp_name,
10805 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010806 return -1;
10807}
10808
Martin v. Löwis5b222132007-06-10 09:51:05 +000010809int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010810_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10811{
10812 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10813 if (right_str == NULL)
10814 return -1;
10815 return PyUnicode_Compare(left, right_str);
10816}
10817
10818int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010819PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10820{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010821 Py_ssize_t i;
10822 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010823 Py_UCS4 chr;
10824
Victor Stinner910337b2011-10-03 03:20:16 +020010825 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010826 if (PyUnicode_READY(uni) == -1)
10827 return -1;
10828 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010829 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010830 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010831 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010832 size_t len, len2 = strlen(str);
10833 int cmp;
10834
10835 len = Py_MIN(len1, len2);
10836 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010837 if (cmp != 0) {
10838 if (cmp < 0)
10839 return -1;
10840 else
10841 return 1;
10842 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010843 if (len1 > len2)
10844 return 1; /* uni is longer */
10845 if (len2 > len1)
10846 return -1; /* str is longer */
10847 return 0;
10848 }
10849 else {
10850 void *data = PyUnicode_DATA(uni);
10851 /* Compare Unicode string and source character set string */
10852 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10853 if (chr != str[i])
10854 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10855 /* This check keeps Python strings that end in '\0' from comparing equal
10856 to C strings identical up to that point. */
10857 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10858 return 1; /* uni is longer */
10859 if (str[i])
10860 return -1; /* str is longer */
10861 return 0;
10862 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010863}
10864
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010865
Benjamin Peterson29060642009-01-31 22:14:21 +000010866#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010867 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010868
Alexander Belopolsky40018472011-02-26 01:02:56 +000010869PyObject *
10870PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010871{
10872 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010873 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010874
Victor Stinnere5567ad2012-10-23 02:48:49 +020010875 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10876 Py_RETURN_NOTIMPLEMENTED;
10877
10878 if (PyUnicode_READY(left) == -1 ||
10879 PyUnicode_READY(right) == -1)
10880 return NULL;
10881
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010882 if (left == right) {
10883 switch (op) {
10884 case Py_EQ:
10885 case Py_LE:
10886 case Py_GE:
10887 /* a string is equal to itself */
10888 v = Py_True;
10889 break;
10890 case Py_NE:
10891 case Py_LT:
10892 case Py_GT:
10893 v = Py_False;
10894 break;
10895 default:
10896 PyErr_BadArgument();
10897 return NULL;
10898 }
10899 }
10900 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010901 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010902 result ^= (op == Py_NE);
10903 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010904 }
10905 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010906 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010907
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010908 /* Convert the return value to a Boolean */
10909 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010910 case Py_LE:
10911 v = TEST_COND(result <= 0);
10912 break;
10913 case Py_GE:
10914 v = TEST_COND(result >= 0);
10915 break;
10916 case Py_LT:
10917 v = TEST_COND(result == -1);
10918 break;
10919 case Py_GT:
10920 v = TEST_COND(result == 1);
10921 break;
10922 default:
10923 PyErr_BadArgument();
10924 return NULL;
10925 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010926 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010927 Py_INCREF(v);
10928 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010929}
10930
Alexander Belopolsky40018472011-02-26 01:02:56 +000010931int
10932PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010933{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010934 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010935 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936 void *buf1, *buf2;
10937 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010938 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010939
10940 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010941 sub = PyUnicode_FromObject(element);
10942 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010943 PyErr_Format(PyExc_TypeError,
10944 "'in <string>' requires string as left operand, not %s",
10945 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010946 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010947 }
10948
Thomas Wouters477c8d52006-05-27 19:21:47 +000010949 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010950 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010951 Py_DECREF(sub);
10952 return -1;
10953 }
10954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010955 kind1 = PyUnicode_KIND(str);
10956 kind2 = PyUnicode_KIND(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957 buf1 = PyUnicode_DATA(str);
10958 buf2 = PyUnicode_DATA(sub);
Victor Stinner77282cb2013-04-14 19:22:47 +020010959 if (kind2 != kind1) {
10960 if (kind2 > kind1) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010961 Py_DECREF(sub);
10962 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010963 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010964 }
Victor Stinner77282cb2013-04-14 19:22:47 +020010965 buf2 = _PyUnicode_AsKind(sub, kind1);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010966 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010967 if (!buf2) {
10968 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010969 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010970 return -1;
10971 }
10972 len1 = PyUnicode_GET_LENGTH(str);
10973 len2 = PyUnicode_GET_LENGTH(sub);
10974
Victor Stinner77282cb2013-04-14 19:22:47 +020010975 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 case PyUnicode_1BYTE_KIND:
10977 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10978 break;
10979 case PyUnicode_2BYTE_KIND:
10980 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10981 break;
10982 case PyUnicode_4BYTE_KIND:
10983 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10984 break;
10985 default:
10986 result = -1;
10987 assert(0);
10988 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010989
10990 Py_DECREF(str);
10991 Py_DECREF(sub);
10992
Victor Stinner77282cb2013-04-14 19:22:47 +020010993 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 PyMem_Free(buf2);
10995
Guido van Rossum403d68b2000-03-13 15:55:09 +000010996 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010997}
10998
Guido van Rossumd57fd912000-03-10 22:53:23 +000010999/* Concat to string or Unicode object giving a new Unicode object. */
11000
Alexander Belopolsky40018472011-02-26 01:02:56 +000011001PyObject *
11002PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020011005 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010011006 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007
11008 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011011 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011012 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011014 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015
11016 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011017 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011018 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011019 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011021 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011022 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024 }
11025
Victor Stinner488fa492011-12-12 00:01:39 +010011026 u_len = PyUnicode_GET_LENGTH(u);
11027 v_len = PyUnicode_GET_LENGTH(v);
11028 if (u_len > PY_SSIZE_T_MAX - v_len) {
11029 PyErr_SetString(PyExc_OverflowError,
11030 "strings are too large to concat");
11031 goto onError;
11032 }
11033 new_len = u_len + v_len;
11034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011035 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011036 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011037 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011038
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011040 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011042 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011043 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11044 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045 Py_DECREF(u);
11046 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011047 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011048 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049
Benjamin Peterson29060642009-01-31 22:14:21 +000011050 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051 Py_XDECREF(u);
11052 Py_XDECREF(v);
11053 return NULL;
11054}
11055
Walter Dörwald1ab83302007-05-18 17:15:44 +000011056void
Victor Stinner23e56682011-10-03 03:54:37 +020011057PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011058{
Victor Stinner23e56682011-10-03 03:54:37 +020011059 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011060 Py_UCS4 maxchar, maxchar2;
11061 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011062
11063 if (p_left == NULL) {
11064 if (!PyErr_Occurred())
11065 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011066 return;
11067 }
Victor Stinner23e56682011-10-03 03:54:37 +020011068 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011069 if (right == NULL || left == NULL
11070 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011071 if (!PyErr_Occurred())
11072 PyErr_BadInternalCall();
11073 goto error;
11074 }
11075
Benjamin Petersonbac79492012-01-14 13:34:47 -050011076 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011077 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011078 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011079 goto error;
11080
Victor Stinner488fa492011-12-12 00:01:39 +010011081 /* Shortcuts */
11082 if (left == unicode_empty) {
11083 Py_DECREF(left);
11084 Py_INCREF(right);
11085 *p_left = right;
11086 return;
11087 }
11088 if (right == unicode_empty)
11089 return;
11090
11091 left_len = PyUnicode_GET_LENGTH(left);
11092 right_len = PyUnicode_GET_LENGTH(right);
11093 if (left_len > PY_SSIZE_T_MAX - right_len) {
11094 PyErr_SetString(PyExc_OverflowError,
11095 "strings are too large to concat");
11096 goto error;
11097 }
11098 new_len = left_len + right_len;
11099
11100 if (unicode_modifiable(left)
11101 && PyUnicode_CheckExact(right)
11102 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011103 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11104 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011105 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011106 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011107 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11108 {
11109 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011110 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011111 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011112
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011113 /* copy 'right' into the newly allocated area of 'left' */
11114 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011115 }
Victor Stinner488fa492011-12-12 00:01:39 +010011116 else {
11117 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11118 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011119 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011120
Victor Stinner488fa492011-12-12 00:01:39 +010011121 /* Concat the two Unicode strings */
11122 res = PyUnicode_New(new_len, maxchar);
11123 if (res == NULL)
11124 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011125 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11126 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011127 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011128 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011129 }
11130 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011131 return;
11132
11133error:
Victor Stinner488fa492011-12-12 00:01:39 +010011134 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011135}
11136
11137void
11138PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11139{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011140 PyUnicode_Append(pleft, right);
11141 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011142}
11143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011144PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011145 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011146\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011147Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011148string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011149interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011150
11151static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011152unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153{
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +020011154 PyObject *substring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011155 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011156 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011157 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011158 int kind1, kind2, kind;
11159 void *buf1, *buf2;
11160 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011161
Jesus Ceaac451502011-04-20 17:09:23 +020011162 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11163 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011164 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011166 kind1 = PyUnicode_KIND(self);
11167 kind2 = PyUnicode_KIND(substring);
Christian Heimesd47802e2013-06-29 21:33:36 +020011168 if (kind2 > kind1) {
11169 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011170 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011171 }
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011172 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173 buf1 = PyUnicode_DATA(self);
11174 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011175 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011176 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177 if (!buf2) {
11178 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179 return NULL;
11180 }
11181 len1 = PyUnicode_GET_LENGTH(self);
11182 len2 = PyUnicode_GET_LENGTH(substring);
11183
11184 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011185 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011186 case PyUnicode_1BYTE_KIND:
11187 iresult = ucs1lib_count(
11188 ((Py_UCS1*)buf1) + start, end - start,
11189 buf2, len2, PY_SSIZE_T_MAX
11190 );
11191 break;
11192 case PyUnicode_2BYTE_KIND:
11193 iresult = ucs2lib_count(
11194 ((Py_UCS2*)buf1) + start, end - start,
11195 buf2, len2, PY_SSIZE_T_MAX
11196 );
11197 break;
11198 case PyUnicode_4BYTE_KIND:
11199 iresult = ucs4lib_count(
11200 ((Py_UCS4*)buf1) + start, end - start,
11201 buf2, len2, PY_SSIZE_T_MAX
11202 );
11203 break;
11204 default:
11205 assert(0); iresult = 0;
11206 }
11207
11208 result = PyLong_FromSsize_t(iresult);
11209
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011210 if (kind2 != kind)
11211 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212
11213 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011214
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215 return result;
11216}
11217
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011218PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011219 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011220\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011221Encode S using the codec registered for encoding. Default encoding\n\
11222is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011223handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011224a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11225'xmlcharrefreplace' as well as any other name registered with\n\
11226codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227
11228static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011229unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011231 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232 char *encoding = NULL;
11233 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011234
Benjamin Peterson308d6372009-09-18 21:42:35 +000011235 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11236 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011238 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011239}
11240
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011241PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011242 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243\n\
11244Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011245If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011246
11247static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011248unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011250 Py_ssize_t i, j, line_pos, src_len, incr;
11251 Py_UCS4 ch;
11252 PyObject *u;
11253 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011254 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011256 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011257 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258
Ezio Melotti745d54d2013-11-16 19:10:57 +020011259 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11260 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011261 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262
Antoine Pitrou22425222011-10-04 19:10:51 +020011263 if (PyUnicode_READY(self) == -1)
11264 return NULL;
11265
Thomas Wouters7e474022000-07-16 12:04:32 +000011266 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011267 src_len = PyUnicode_GET_LENGTH(self);
11268 i = j = line_pos = 0;
11269 kind = PyUnicode_KIND(self);
11270 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011271 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011272 for (; i < src_len; i++) {
11273 ch = PyUnicode_READ(kind, src_data, i);
11274 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011275 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011276 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011277 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011278 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011279 goto overflow;
11280 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011281 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011282 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011285 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011286 goto overflow;
11287 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011288 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011289 if (ch == '\n' || ch == '\r')
11290 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011292 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011293 if (!found)
11294 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011295
Guido van Rossumd57fd912000-03-10 22:53:23 +000011296 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011297 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298 if (!u)
11299 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011300 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301
Antoine Pitroue71d5742011-10-04 15:55:09 +020011302 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303
Antoine Pitroue71d5742011-10-04 15:55:09 +020011304 for (; i < src_len; i++) {
11305 ch = PyUnicode_READ(kind, src_data, i);
11306 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011307 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011308 incr = tabsize - (line_pos % tabsize);
11309 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011310 FILL(kind, dest_data, ' ', j, incr);
11311 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011312 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011313 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011314 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011315 line_pos++;
11316 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011317 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011318 if (ch == '\n' || ch == '\r')
11319 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011321 }
11322 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011323 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011324
Antoine Pitroue71d5742011-10-04 15:55:09 +020011325 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011326 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11327 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328}
11329
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011330PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011331 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332\n\
11333Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011334such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011335arguments start and end are interpreted as in slice notation.\n\
11336\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011337Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338
11339static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011340unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341{
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +020011342 PyObject *substring = NULL;
11343 Py_ssize_t start = 0;
11344 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011345 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346
Jesus Ceaac451502011-04-20 17:09:23 +020011347 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11348 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350
Christian Heimesd47802e2013-06-29 21:33:36 +020011351 if (PyUnicode_READY(self) == -1) {
11352 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011354 }
11355 if (PyUnicode_READY(substring) == -1) {
11356 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011357 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011358 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011359
Victor Stinner7931d9a2011-11-04 00:22:48 +010011360 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361
11362 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011363
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011364 if (result == -2)
11365 return NULL;
11366
Christian Heimes217cfd12007-12-02 14:31:20 +000011367 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368}
11369
11370static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011371unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011373 void *data;
11374 enum PyUnicode_Kind kind;
11375 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011376
11377 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11378 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011379 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011380 }
11381 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11382 PyErr_SetString(PyExc_IndexError, "string index out of range");
11383 return NULL;
11384 }
11385 kind = PyUnicode_KIND(self);
11386 data = PyUnicode_DATA(self);
11387 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011388 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389}
11390
Guido van Rossumc2504932007-09-18 19:42:40 +000011391/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011392 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011393static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011394unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011395{
Guido van Rossumc2504932007-09-18 19:42:40 +000011396 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011397 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011398
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011399#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011400 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011401#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011402 if (_PyUnicode_HASH(self) != -1)
11403 return _PyUnicode_HASH(self);
11404 if (PyUnicode_READY(self) == -1)
11405 return -1;
11406 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011407 /*
11408 We make the hash of the empty string be 0, rather than using
11409 (prefix ^ suffix), since this slightly obfuscates the hash secret
11410 */
11411 if (len == 0) {
11412 _PyUnicode_HASH(self) = 0;
11413 return 0;
11414 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011415 x = _Py_HashBytes(PyUnicode_DATA(self),
11416 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011417 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011418 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419}
11420
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011421PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011422 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011424Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
11426static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011427unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011429 Py_ssize_t result;
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +020011430 PyObject *substring = NULL;
11431 Py_ssize_t start = 0;
11432 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433
Jesus Ceaac451502011-04-20 17:09:23 +020011434 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11435 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437
Christian Heimesd47a0452013-06-29 21:21:37 +020011438 if (PyUnicode_READY(self) == -1) {
11439 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011440 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011441 }
11442 if (PyUnicode_READY(substring) == -1) {
11443 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011445 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446
Victor Stinner7931d9a2011-11-04 00:22:48 +010011447 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448
11449 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011451 if (result == -2)
11452 return NULL;
11453
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454 if (result < 0) {
11455 PyErr_SetString(PyExc_ValueError, "substring not found");
11456 return NULL;
11457 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011458
Christian Heimes217cfd12007-12-02 14:31:20 +000011459 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460}
11461
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011462PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011463 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011465Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011466at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467
11468static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011469unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011471 Py_ssize_t i, length;
11472 int kind;
11473 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474 int cased;
11475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011476 if (PyUnicode_READY(self) == -1)
11477 return NULL;
11478 length = PyUnicode_GET_LENGTH(self);
11479 kind = PyUnicode_KIND(self);
11480 data = PyUnicode_DATA(self);
11481
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011483 if (length == 1)
11484 return PyBool_FromLong(
11485 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011487 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011488 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011489 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011490
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011492 for (i = 0; i < length; i++) {
11493 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011494
Benjamin Peterson29060642009-01-31 22:14:21 +000011495 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11496 return PyBool_FromLong(0);
11497 else if (!cased && Py_UNICODE_ISLOWER(ch))
11498 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011500 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501}
11502
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011503PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011504 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011506Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011507at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508
11509static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011510unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011512 Py_ssize_t i, length;
11513 int kind;
11514 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515 int cased;
11516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 if (PyUnicode_READY(self) == -1)
11518 return NULL;
11519 length = PyUnicode_GET_LENGTH(self);
11520 kind = PyUnicode_KIND(self);
11521 data = PyUnicode_DATA(self);
11522
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 if (length == 1)
11525 return PyBool_FromLong(
11526 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011528 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011529 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011530 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011531
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011533 for (i = 0; i < length; i++) {
11534 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011535
Benjamin Peterson29060642009-01-31 22:14:21 +000011536 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11537 return PyBool_FromLong(0);
11538 else if (!cased && Py_UNICODE_ISUPPER(ch))
11539 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011541 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542}
11543
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011544PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011545 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011547Return True if S is a titlecased string and there is at least one\n\
11548character in S, i.e. upper- and titlecase characters may only\n\
11549follow uncased characters and lowercase characters only cased ones.\n\
11550Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551
11552static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011553unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011555 Py_ssize_t i, length;
11556 int kind;
11557 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558 int cased, previous_is_cased;
11559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560 if (PyUnicode_READY(self) == -1)
11561 return NULL;
11562 length = PyUnicode_GET_LENGTH(self);
11563 kind = PyUnicode_KIND(self);
11564 data = PyUnicode_DATA(self);
11565
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011567 if (length == 1) {
11568 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11569 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11570 (Py_UNICODE_ISUPPER(ch) != 0));
11571 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011573 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011574 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011575 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011576
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577 cased = 0;
11578 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011579 for (i = 0; i < length; i++) {
11580 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011581
Benjamin Peterson29060642009-01-31 22:14:21 +000011582 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11583 if (previous_is_cased)
11584 return PyBool_FromLong(0);
11585 previous_is_cased = 1;
11586 cased = 1;
11587 }
11588 else if (Py_UNICODE_ISLOWER(ch)) {
11589 if (!previous_is_cased)
11590 return PyBool_FromLong(0);
11591 previous_is_cased = 1;
11592 cased = 1;
11593 }
11594 else
11595 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011597 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598}
11599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011600PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011601 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011603Return True if all characters in S are whitespace\n\
11604and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605
11606static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011607unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609 Py_ssize_t i, length;
11610 int kind;
11611 void *data;
11612
11613 if (PyUnicode_READY(self) == -1)
11614 return NULL;
11615 length = PyUnicode_GET_LENGTH(self);
11616 kind = PyUnicode_KIND(self);
11617 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620 if (length == 1)
11621 return PyBool_FromLong(
11622 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011624 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011626 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628 for (i = 0; i < length; i++) {
11629 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011630 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011631 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011633 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634}
11635
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011636PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011637 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011638\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011639Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011640and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011641
11642static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011643unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011644{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011645 Py_ssize_t i, length;
11646 int kind;
11647 void *data;
11648
11649 if (PyUnicode_READY(self) == -1)
11650 return NULL;
11651 length = PyUnicode_GET_LENGTH(self);
11652 kind = PyUnicode_KIND(self);
11653 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011654
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011655 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011656 if (length == 1)
11657 return PyBool_FromLong(
11658 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011659
11660 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011662 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011663
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011664 for (i = 0; i < length; i++) {
11665 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011666 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011667 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011668 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011669}
11670
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011671PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011672 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011673\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011674Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011675and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011676
11677static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011678unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011679{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680 int kind;
11681 void *data;
11682 Py_ssize_t len, i;
11683
11684 if (PyUnicode_READY(self) == -1)
11685 return NULL;
11686
11687 kind = PyUnicode_KIND(self);
11688 data = PyUnicode_DATA(self);
11689 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011690
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011691 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 if (len == 1) {
11693 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11694 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11695 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011696
11697 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011698 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011699 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701 for (i = 0; i < len; i++) {
11702 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011703 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011704 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011705 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011706 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011707}
11708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011709PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011710 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011712Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011713False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714
11715static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011716unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011718 Py_ssize_t i, length;
11719 int kind;
11720 void *data;
11721
11722 if (PyUnicode_READY(self) == -1)
11723 return NULL;
11724 length = PyUnicode_GET_LENGTH(self);
11725 kind = PyUnicode_KIND(self);
11726 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011729 if (length == 1)
11730 return PyBool_FromLong(
11731 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011733 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011735 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011737 for (i = 0; i < length; i++) {
11738 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011739 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011741 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742}
11743
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011744PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011745 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011747Return True if all characters in S are digits\n\
11748and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749
11750static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011751unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 Py_ssize_t i, length;
11754 int kind;
11755 void *data;
11756
11757 if (PyUnicode_READY(self) == -1)
11758 return NULL;
11759 length = PyUnicode_GET_LENGTH(self);
11760 kind = PyUnicode_KIND(self);
11761 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764 if (length == 1) {
11765 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11766 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11767 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011769 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011771 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011773 for (i = 0; i < length; i++) {
11774 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011775 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011777 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778}
11779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011780PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011781 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011783Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011784False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785
11786static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011787unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 Py_ssize_t i, length;
11790 int kind;
11791 void *data;
11792
11793 if (PyUnicode_READY(self) == -1)
11794 return NULL;
11795 length = PyUnicode_GET_LENGTH(self);
11796 kind = PyUnicode_KIND(self);
11797 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 if (length == 1)
11801 return PyBool_FromLong(
11802 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011804 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011805 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011806 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011808 for (i = 0; i < length; i++) {
11809 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011810 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011812 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813}
11814
Martin v. Löwis47383402007-08-15 07:32:56 +000011815int
11816PyUnicode_IsIdentifier(PyObject *self)
11817{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 int kind;
11819 void *data;
11820 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011821 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011822
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823 if (PyUnicode_READY(self) == -1) {
11824 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011825 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826 }
11827
11828 /* Special case for empty strings */
11829 if (PyUnicode_GET_LENGTH(self) == 0)
11830 return 0;
11831 kind = PyUnicode_KIND(self);
11832 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011833
11834 /* PEP 3131 says that the first character must be in
11835 XID_Start and subsequent characters in XID_Continue,
11836 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011837 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011838 letters, digits, underscore). However, given the current
11839 definition of XID_Start and XID_Continue, it is sufficient
11840 to check just for these, except that _ must be allowed
11841 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011842 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011843 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011844 return 0;
11845
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011846 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011847 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011848 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011849 return 1;
11850}
11851
11852PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011853 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011854\n\
11855Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011856to the language definition.\n\
11857\n\
11858Use keyword.iskeyword() to test for reserved identifiers\n\
11859such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011860
11861static PyObject*
11862unicode_isidentifier(PyObject *self)
11863{
11864 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11865}
11866
Georg Brandl559e5d72008-06-11 18:37:52 +000011867PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011868 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011869\n\
11870Return True if all characters in S are considered\n\
11871printable in repr() or S is empty, False otherwise.");
11872
11873static PyObject*
11874unicode_isprintable(PyObject *self)
11875{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 Py_ssize_t i, length;
11877 int kind;
11878 void *data;
11879
11880 if (PyUnicode_READY(self) == -1)
11881 return NULL;
11882 length = PyUnicode_GET_LENGTH(self);
11883 kind = PyUnicode_KIND(self);
11884 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011885
11886 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887 if (length == 1)
11888 return PyBool_FromLong(
11889 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011890
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011891 for (i = 0; i < length; i++) {
11892 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011893 Py_RETURN_FALSE;
11894 }
11895 }
11896 Py_RETURN_TRUE;
11897}
11898
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011899PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011900 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901\n\
11902Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011903iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904
11905static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011906unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011908 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909}
11910
Martin v. Löwis18e16552006-02-15 17:27:45 +000011911static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011912unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 if (PyUnicode_READY(self) == -1)
11915 return -1;
11916 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917}
11918
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011919PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011920 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011922Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011923done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924
11925static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011926unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011928 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011929 Py_UCS4 fillchar = ' ';
11930
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011931 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932 return NULL;
11933
Benjamin Petersonbac79492012-01-14 13:34:47 -050011934 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011935 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936
Victor Stinnerc4b49542011-12-11 22:44:26 +010011937 if (PyUnicode_GET_LENGTH(self) >= width)
11938 return unicode_result_unchanged(self);
11939
11940 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941}
11942
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011943PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011944 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011946Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947
11948static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011949unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011950{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011951 if (PyUnicode_READY(self) == -1)
11952 return NULL;
11953 if (PyUnicode_IS_ASCII(self))
11954 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011955 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011956}
11957
/* Selectors naming which end(s) of a string a strip operation trims.
   They double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the selectors above.
   Both the strings and the pointer slots are const: the table is a
   fixed lookup and is never meant to be modified at run time. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: the matching format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11966
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011967/* externally visible for str.strip(unicode) */
11968PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011969_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011970{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 void *data;
11972 int kind;
11973 Py_ssize_t i, j, len;
11974 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011975 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011977 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11978 return NULL;
11979
11980 kind = PyUnicode_KIND(self);
11981 data = PyUnicode_DATA(self);
11982 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011983 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11985 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011986 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011987
Benjamin Peterson14339b62009-01-31 16:36:08 +000011988 i = 0;
11989 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011990 while (i < len) {
11991 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11992 if (!BLOOM(sepmask, ch))
11993 break;
11994 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11995 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 i++;
11997 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011998 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011999
Benjamin Peterson14339b62009-01-31 16:36:08 +000012000 j = len;
12001 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012002 j--;
12003 while (j >= i) {
12004 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12005 if (!BLOOM(sepmask, ch))
12006 break;
12007 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12008 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012009 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012010 }
12011
Benjamin Peterson29060642009-01-31 22:14:21 +000012012 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012013 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012014
Victor Stinner7931d9a2011-11-04 00:22:48 +010012015 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016}
12017
12018PyObject*
12019PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12020{
12021 unsigned char *data;
12022 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012023 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024
Victor Stinnerde636f32011-10-01 03:55:54 +020012025 if (PyUnicode_READY(self) == -1)
12026 return NULL;
12027
Victor Stinner684d5fd2012-05-03 02:32:34 +020012028 length = PyUnicode_GET_LENGTH(self);
12029 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012030
Victor Stinner684d5fd2012-05-03 02:32:34 +020012031 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012032 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033
Victor Stinnerde636f32011-10-01 03:55:54 +020012034 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012035 PyErr_SetString(PyExc_IndexError, "string index out of range");
12036 return NULL;
12037 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012038 if (start >= length || end < start)
12039 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012040
Victor Stinner684d5fd2012-05-03 02:32:34 +020012041 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012042 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012043 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012044 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012045 }
12046 else {
12047 kind = PyUnicode_KIND(self);
12048 data = PyUnicode_1BYTE_DATA(self);
12049 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012050 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012051 length);
12052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054
12055static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012056do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058 Py_ssize_t len, i, j;
12059
12060 if (PyUnicode_READY(self) == -1)
12061 return NULL;
12062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012064
Victor Stinnercc7af722013-04-09 22:39:24 +020012065 if (PyUnicode_IS_ASCII(self)) {
12066 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12067
12068 i = 0;
12069 if (striptype != RIGHTSTRIP) {
12070 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012071 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012072 if (!_Py_ascii_whitespace[ch])
12073 break;
12074 i++;
12075 }
12076 }
12077
12078 j = len;
12079 if (striptype != LEFTSTRIP) {
12080 j--;
12081 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012082 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012083 if (!_Py_ascii_whitespace[ch])
12084 break;
12085 j--;
12086 }
12087 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012088 }
12089 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012090 else {
12091 int kind = PyUnicode_KIND(self);
12092 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012093
Victor Stinnercc7af722013-04-09 22:39:24 +020012094 i = 0;
12095 if (striptype != RIGHTSTRIP) {
12096 while (i < len) {
12097 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12098 if (!Py_UNICODE_ISSPACE(ch))
12099 break;
12100 i++;
12101 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012102 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012103
12104 j = len;
12105 if (striptype != LEFTSTRIP) {
12106 j--;
12107 while (j >= i) {
12108 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12109 if (!Py_UNICODE_ISSPACE(ch))
12110 break;
12111 j--;
12112 }
12113 j++;
12114 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012115 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012116
Victor Stinner7931d9a2011-11-04 00:22:48 +010012117 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118}
12119
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012120
12121static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012122do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012123{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012124 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012125
Serhiy Storchakac6792272013-10-19 21:03:34 +030012126 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012127 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012128
Benjamin Peterson14339b62009-01-31 16:36:08 +000012129 if (sep != NULL && sep != Py_None) {
12130 if (PyUnicode_Check(sep))
12131 return _PyUnicode_XStrip(self, striptype, sep);
12132 else {
12133 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012134 "%s arg must be None or str",
12135 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012136 return NULL;
12137 }
12138 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012139
Benjamin Peterson14339b62009-01-31 16:36:08 +000012140 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012141}
12142
12143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012144PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012145 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012146\n\
12147Return a copy of the string S with leading and trailing\n\
12148whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012149If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012150
12151static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012152unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012153{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012154 if (PyTuple_GET_SIZE(args) == 0)
12155 return do_strip(self, BOTHSTRIP); /* Common case */
12156 else
12157 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012158}
12159
12160
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012161PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012162 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012163\n\
12164Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012165If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012166
12167static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012168unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012169{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012170 if (PyTuple_GET_SIZE(args) == 0)
12171 return do_strip(self, LEFTSTRIP); /* Common case */
12172 else
12173 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012174}
12175
12176
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012177PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012178 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012179\n\
12180Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012181If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012182
12183static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012184unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012185{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012186 if (PyTuple_GET_SIZE(args) == 0)
12187 return do_strip(self, RIGHTSTRIP); /* Common case */
12188 else
12189 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012190}
12191
12192
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012194unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012196 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198
Serhiy Storchaka05997252013-01-26 12:14:02 +020012199 if (len < 1)
12200 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201
Victor Stinnerc4b49542011-12-11 22:44:26 +010012202 /* no repeat, return original string */
12203 if (len == 1)
12204 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012205
Benjamin Petersonbac79492012-01-14 13:34:47 -050012206 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207 return NULL;
12208
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012209 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012210 PyErr_SetString(PyExc_OverflowError,
12211 "repeated string is too long");
12212 return NULL;
12213 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012215
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012216 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217 if (!u)
12218 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012219 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012220
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012221 if (PyUnicode_GET_LENGTH(str) == 1) {
12222 const int kind = PyUnicode_KIND(str);
12223 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012224 if (kind == PyUnicode_1BYTE_KIND) {
12225 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012226 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012227 }
12228 else if (kind == PyUnicode_2BYTE_KIND) {
12229 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012230 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012231 ucs2[n] = fill_char;
12232 } else {
12233 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12234 assert(kind == PyUnicode_4BYTE_KIND);
12235 for (n = 0; n < len; ++n)
12236 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012237 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012238 }
12239 else {
12240 /* number of characters copied this far */
12241 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012242 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012243 char *to = (char *) PyUnicode_DATA(u);
12244 Py_MEMCPY(to, PyUnicode_DATA(str),
12245 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012246 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012247 n = (done <= nchars-done) ? done : nchars-done;
12248 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012249 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012250 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251 }
12252
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012253 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012254 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255}
12256
Alexander Belopolsky40018472011-02-26 01:02:56 +000012257PyObject *
12258PyUnicode_Replace(PyObject *obj,
12259 PyObject *subobj,
12260 PyObject *replobj,
12261 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262{
12263 PyObject *self;
12264 PyObject *str1;
12265 PyObject *str2;
12266 PyObject *result;
12267
12268 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012269 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012270 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012272 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012273 Py_DECREF(self);
12274 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275 }
12276 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012277 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012278 Py_DECREF(self);
12279 Py_DECREF(str1);
12280 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012282 if (PyUnicode_READY(self) == -1 ||
12283 PyUnicode_READY(str1) == -1 ||
12284 PyUnicode_READY(str2) == -1)
12285 result = NULL;
12286 else
12287 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012288 Py_DECREF(self);
12289 Py_DECREF(str1);
12290 Py_DECREF(str2);
12291 return result;
12292}
12293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012294PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012295 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296\n\
12297Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012298old replaced by new. If the optional argument count is\n\
12299given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012300
12301static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012302unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 PyObject *str1;
12305 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012306 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307 PyObject *result;
12308
Martin v. Löwis18e16552006-02-15 17:27:45 +000012309 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012310 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012311 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012312 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012313 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012314 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012315 return NULL;
12316 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012317 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012318 Py_DECREF(str1);
12319 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012320 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012321 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12322 result = NULL;
12323 else
12324 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012325
12326 Py_DECREF(str1);
12327 Py_DECREF(str2);
12328 return result;
12329}
12330
Alexander Belopolsky40018472011-02-26 01:02:56 +000012331static PyObject *
12332unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012333{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012334 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335 Py_ssize_t isize;
12336 Py_ssize_t osize, squote, dquote, i, o;
12337 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012338 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012339 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012341 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012342 return NULL;
12343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344 isize = PyUnicode_GET_LENGTH(unicode);
12345 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012346
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012347 /* Compute length of output, quote characters, and
12348 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012349 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350 max = 127;
12351 squote = dquote = 0;
12352 ikind = PyUnicode_KIND(unicode);
12353 for (i = 0; i < isize; i++) {
12354 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012355 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012356 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012357 case '\'': squote++; break;
12358 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012360 incr = 2;
12361 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362 default:
12363 /* Fast-path ASCII */
12364 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012365 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012367 ;
12368 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012370 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012371 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012373 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012374 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012375 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012377 if (osize > PY_SSIZE_T_MAX - incr) {
12378 PyErr_SetString(PyExc_OverflowError,
12379 "string is too long to generate repr");
12380 return NULL;
12381 }
12382 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012383 }
12384
12385 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012386 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012387 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012388 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012389 if (dquote)
12390 /* Both squote and dquote present. Use squote,
12391 and escape them */
12392 osize += squote;
12393 else
12394 quote = '"';
12395 }
Victor Stinner55c08782013-04-14 18:45:39 +020012396 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012397
12398 repr = PyUnicode_New(osize, max);
12399 if (repr == NULL)
12400 return NULL;
12401 okind = PyUnicode_KIND(repr);
12402 odata = PyUnicode_DATA(repr);
12403
12404 PyUnicode_WRITE(okind, odata, 0, quote);
12405 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012406 if (unchanged) {
12407 _PyUnicode_FastCopyCharacters(repr, 1,
12408 unicode, 0,
12409 isize);
12410 }
12411 else {
12412 for (i = 0, o = 1; i < isize; i++) {
12413 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012414
Victor Stinner55c08782013-04-14 18:45:39 +020012415 /* Escape quotes and backslashes */
12416 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012417 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012418 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012419 continue;
12420 }
12421
12422 /* Map special whitespace to '\t', \n', '\r' */
12423 if (ch == '\t') {
12424 PyUnicode_WRITE(okind, odata, o++, '\\');
12425 PyUnicode_WRITE(okind, odata, o++, 't');
12426 }
12427 else if (ch == '\n') {
12428 PyUnicode_WRITE(okind, odata, o++, '\\');
12429 PyUnicode_WRITE(okind, odata, o++, 'n');
12430 }
12431 else if (ch == '\r') {
12432 PyUnicode_WRITE(okind, odata, o++, '\\');
12433 PyUnicode_WRITE(okind, odata, o++, 'r');
12434 }
12435
12436 /* Map non-printable US ASCII to '\xhh' */
12437 else if (ch < ' ' || ch == 0x7F) {
12438 PyUnicode_WRITE(okind, odata, o++, '\\');
12439 PyUnicode_WRITE(okind, odata, o++, 'x');
12440 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12441 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12442 }
12443
12444 /* Copy ASCII characters as-is */
12445 else if (ch < 0x7F) {
12446 PyUnicode_WRITE(okind, odata, o++, ch);
12447 }
12448
12449 /* Non-ASCII characters */
12450 else {
12451 /* Map Unicode whitespace and control characters
12452 (categories Z* and C* except ASCII space)
12453 */
12454 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12455 PyUnicode_WRITE(okind, odata, o++, '\\');
12456 /* Map 8-bit characters to '\xhh' */
12457 if (ch <= 0xff) {
12458 PyUnicode_WRITE(okind, odata, o++, 'x');
12459 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12460 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12461 }
12462 /* Map 16-bit characters to '\uxxxx' */
12463 else if (ch <= 0xffff) {
12464 PyUnicode_WRITE(okind, odata, o++, 'u');
12465 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12466 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12467 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12468 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12469 }
12470 /* Map 21-bit characters to '\U00xxxxxx' */
12471 else {
12472 PyUnicode_WRITE(okind, odata, o++, 'U');
12473 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12474 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12475 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12476 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12477 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12478 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12479 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12480 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12481 }
12482 }
12483 /* Copy characters as-is */
12484 else {
12485 PyUnicode_WRITE(okind, odata, o++, ch);
12486 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012487 }
12488 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012489 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012490 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012491 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012492 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012493}
12494
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012495PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012496 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012497\n\
12498Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012499such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012500arguments start and end are interpreted as in slice notation.\n\
12501\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012502Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503
12504static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012506{
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +020012507 PyObject *substring = NULL;
12508 Py_ssize_t start = 0;
12509 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012510 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511
Jesus Ceaac451502011-04-20 17:09:23 +020012512 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12513 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012514 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515
Christian Heimesea71a522013-06-29 21:17:34 +020012516 if (PyUnicode_READY(self) == -1) {
12517 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012519 }
12520 if (PyUnicode_READY(substring) == -1) {
12521 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012522 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012523 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012524
Victor Stinner7931d9a2011-11-04 00:22:48 +010012525 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526
12527 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529 if (result == -2)
12530 return NULL;
12531
Christian Heimes217cfd12007-12-02 14:31:20 +000012532 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533}
12534
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012535PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012536 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012538Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539
12540static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012541unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542{
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +020012543 PyObject *substring = NULL;
12544 Py_ssize_t start = 0;
12545 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012546 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547
Jesus Ceaac451502011-04-20 17:09:23 +020012548 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12549 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012550 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551
Christian Heimesea71a522013-06-29 21:17:34 +020012552 if (PyUnicode_READY(self) == -1) {
12553 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012554 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012555 }
12556 if (PyUnicode_READY(substring) == -1) {
12557 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012558 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012559 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012560
Victor Stinner7931d9a2011-11-04 00:22:48 +010012561 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012562
12563 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012565 if (result == -2)
12566 return NULL;
12567
Guido van Rossumd57fd912000-03-10 22:53:23 +000012568 if (result < 0) {
12569 PyErr_SetString(PyExc_ValueError, "substring not found");
12570 return NULL;
12571 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012572
Christian Heimes217cfd12007-12-02 14:31:20 +000012573 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012574}
12575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012576PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012577 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012578\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012579Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012580done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581
12582static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012583unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012585 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 Py_UCS4 fillchar = ' ';
12587
Victor Stinnere9a29352011-10-01 02:14:59 +020012588 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012589 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012590
Benjamin Petersonbac79492012-01-14 13:34:47 -050012591 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012592 return NULL;
12593
Victor Stinnerc4b49542011-12-11 22:44:26 +010012594 if (PyUnicode_GET_LENGTH(self) >= width)
12595 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596
Victor Stinnerc4b49542011-12-11 22:44:26 +010012597 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012598}
12599
Alexander Belopolsky40018472011-02-26 01:02:56 +000012600PyObject *
12601PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012602{
12603 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012604
Guido van Rossumd57fd912000-03-10 22:53:23 +000012605 s = PyUnicode_FromObject(s);
12606 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012607 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012608 if (sep != NULL) {
12609 sep = PyUnicode_FromObject(sep);
12610 if (sep == NULL) {
12611 Py_DECREF(s);
12612 return NULL;
12613 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614 }
12615
Victor Stinner9310abb2011-10-05 00:59:23 +020012616 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012617
12618 Py_DECREF(s);
12619 Py_XDECREF(sep);
12620 return result;
12621}
12622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012623PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012624 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012625\n\
12626Return a list of the words in S, using sep as the\n\
12627delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012628splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012629whitespace string is a separator and empty strings are\n\
12630removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631
12632static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012633unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012634{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012635 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012637 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012638
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012639 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12640 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641 return NULL;
12642
12643 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012644 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012645 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012646 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012647 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012648 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012649}
12650
Thomas Wouters477c8d52006-05-27 19:21:47 +000012651PyObject *
12652PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12653{
12654 PyObject* str_obj;
12655 PyObject* sep_obj;
12656 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657 int kind1, kind2, kind;
12658 void *buf1 = NULL, *buf2 = NULL;
12659 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012660
12661 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012662 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012663 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012664 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012665 if (!sep_obj) {
12666 Py_DECREF(str_obj);
12667 return NULL;
12668 }
12669 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12670 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012671 Py_DECREF(str_obj);
12672 return NULL;
12673 }
12674
Victor Stinner14f8f022011-10-05 20:58:25 +020012675 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012677 kind = Py_MAX(kind1, kind2);
12678 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012679 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012680 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681 if (!buf1)
12682 goto onError;
12683 buf2 = PyUnicode_DATA(sep_obj);
12684 if (kind2 != kind)
12685 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12686 if (!buf2)
12687 goto onError;
12688 len1 = PyUnicode_GET_LENGTH(str_obj);
12689 len2 = PyUnicode_GET_LENGTH(sep_obj);
12690
Serhiy Storchaka48070c12015-03-29 19:21:02 +030012691 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012692 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012693 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12694 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12695 else
12696 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012697 break;
12698 case PyUnicode_2BYTE_KIND:
12699 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12700 break;
12701 case PyUnicode_4BYTE_KIND:
12702 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12703 break;
12704 default:
12705 assert(0);
12706 out = 0;
12707 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012708
12709 Py_DECREF(sep_obj);
12710 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711 if (kind1 != kind)
12712 PyMem_Free(buf1);
12713 if (kind2 != kind)
12714 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012715
12716 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012717 onError:
12718 Py_DECREF(sep_obj);
12719 Py_DECREF(str_obj);
12720 if (kind1 != kind && buf1)
12721 PyMem_Free(buf1);
12722 if (kind2 != kind && buf2)
12723 PyMem_Free(buf2);
12724 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012725}
12726
12727
12728PyObject *
12729PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12730{
12731 PyObject* str_obj;
12732 PyObject* sep_obj;
12733 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012734 int kind1, kind2, kind;
12735 void *buf1 = NULL, *buf2 = NULL;
12736 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012737
12738 str_obj = PyUnicode_FromObject(str_in);
12739 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012740 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012741 sep_obj = PyUnicode_FromObject(sep_in);
12742 if (!sep_obj) {
12743 Py_DECREF(str_obj);
12744 return NULL;
12745 }
12746
Serhiy Storchaka48070c12015-03-29 19:21:02 +030012747 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012748 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012749 kind = Py_MAX(kind1, kind2);
Serhiy Storchaka48070c12015-03-29 19:21:02 +030012750 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751 if (kind1 != kind)
Serhiy Storchaka48070c12015-03-29 19:21:02 +030012752 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012753 if (!buf1)
12754 goto onError;
12755 buf2 = PyUnicode_DATA(sep_obj);
12756 if (kind2 != kind)
12757 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12758 if (!buf2)
12759 goto onError;
12760 len1 = PyUnicode_GET_LENGTH(str_obj);
12761 len2 = PyUnicode_GET_LENGTH(sep_obj);
12762
Serhiy Storchaka48070c12015-03-29 19:21:02 +030012763 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012764 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012765 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12766 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12767 else
12768 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012769 break;
12770 case PyUnicode_2BYTE_KIND:
12771 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12772 break;
12773 case PyUnicode_4BYTE_KIND:
12774 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12775 break;
12776 default:
12777 assert(0);
12778 out = 0;
12779 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012780
12781 Py_DECREF(sep_obj);
12782 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012783 if (kind1 != kind)
12784 PyMem_Free(buf1);
12785 if (kind2 != kind)
12786 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012787
12788 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012789 onError:
12790 Py_DECREF(sep_obj);
12791 Py_DECREF(str_obj);
12792 if (kind1 != kind && buf1)
12793 PyMem_Free(buf1);
12794 if (kind2 != kind && buf2)
12795 PyMem_Free(buf2);
12796 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012797}
12798
12799PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012800 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012801\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012802Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012803the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012804found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012805
12806static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012807unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012808{
Victor Stinner9310abb2011-10-05 00:59:23 +020012809 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012810}
12811
12812PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012813 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012814\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012815Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012816the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012817separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012818
12819static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012820unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012821{
Victor Stinner9310abb2011-10-05 00:59:23 +020012822 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012823}
12824
Alexander Belopolsky40018472011-02-26 01:02:56 +000012825PyObject *
12826PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012827{
12828 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012829
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012830 s = PyUnicode_FromObject(s);
12831 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012832 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012833 if (sep != NULL) {
12834 sep = PyUnicode_FromObject(sep);
12835 if (sep == NULL) {
12836 Py_DECREF(s);
12837 return NULL;
12838 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012839 }
12840
Victor Stinner9310abb2011-10-05 00:59:23 +020012841 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012842
12843 Py_DECREF(s);
12844 Py_XDECREF(sep);
12845 return result;
12846}
12847
12848PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012849 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012850\n\
12851Return a list of the words in S, using sep as the\n\
12852delimiter string, starting at the end of the string and\n\
12853working to the front. If maxsplit is given, at most maxsplit\n\
12854splits are done. If sep is not specified, any whitespace string\n\
12855is a separator.");
12856
12857static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012858unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012859{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012860 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012861 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012862 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012863
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012864 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12865 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012866 return NULL;
12867
12868 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012869 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012870 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012871 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012872 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012873 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012874}
12875
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012876PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012877 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012878\n\
12879Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012880Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012881is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012882
12883static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012884unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012885{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012886 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012887 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012888
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012889 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12890 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012891 return NULL;
12892
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012893 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012894}
12895
12896static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012897PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012898{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012899 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012900}
12901
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012902PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012903 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012904\n\
12905Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012906and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012907
12908static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012909unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012910{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012911 if (PyUnicode_READY(self) == -1)
12912 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012913 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012914}
12915
Larry Hastings61272b72014-01-07 12:41:53 -080012916/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012917
Larry Hastings31826802013-10-19 00:09:25 -070012918@staticmethod
12919str.maketrans as unicode_maketrans
12920
12921 x: object
12922
12923 y: unicode=NULL
12924
12925 z: unicode=NULL
12926
12927 /
12928
12929Return a translation table usable for str.translate().
12930
12931If there is only one argument, it must be a dictionary mapping Unicode
12932ordinals (integers) or characters to Unicode ordinals, strings or None.
12933Character keys will be then converted to ordinals.
12934If there are two arguments, they must be strings of equal length, and
12935in the resulting dictionary, each character in x will be mapped to the
12936character at the same position in y. If there is a third argument, it
12937must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012938[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012939
12940PyDoc_STRVAR(unicode_maketrans__doc__,
Larry Hastings2623c8c2014-02-08 22:15:29 -080012941"maketrans(x, y=None, z=None, /)\n"
12942"--\n"
12943"\n"
Larry Hastings31826802013-10-19 00:09:25 -070012944"Return a translation table usable for str.translate().\n"
12945"\n"
Larry Hastings31826802013-10-19 00:09:25 -070012946"If there is only one argument, it must be a dictionary mapping Unicode\n"
12947"ordinals (integers) or characters to Unicode ordinals, strings or None.\n"
12948"Character keys will be then converted to ordinals.\n"
12949"If there are two arguments, they must be strings of equal length, and\n"
12950"in the resulting dictionary, each character in x will be mapped to the\n"
12951"character at the same position in y. If there is a third argument, it\n"
12952"must be a string, whose characters will be mapped to None in the result.");
12953
12954#define UNICODE_MAKETRANS_METHODDEF \
12955 {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__},
12956
12957static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012958unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z);
Larry Hastings31826802013-10-19 00:09:25 -070012959
12960static PyObject *
Larry Hastingsebdcb502013-11-23 14:54:00 -080012961unicode_maketrans(void *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012962{
Larry Hastings31826802013-10-19 00:09:25 -070012963 PyObject *return_value = NULL;
12964 PyObject *x;
12965 PyObject *y = NULL;
12966 PyObject *z = NULL;
12967
12968 if (!PyArg_ParseTuple(args,
12969 "O|UU:maketrans",
12970 &x, &y, &z))
12971 goto exit;
Larry Hastings5c661892014-01-24 06:17:25 -080012972 return_value = unicode_maketrans_impl(x, y, z);
Larry Hastings31826802013-10-19 00:09:25 -070012973
12974exit:
12975 return return_value;
12976}
12977
12978static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012979unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Larry Hastings2623c8c2014-02-08 22:15:29 -080012980/*[clinic end generated code: output=566edf630f77436a input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012981{
Georg Brandlceee0772007-11-27 23:48:05 +000012982 PyObject *new = NULL, *key, *value;
12983 Py_ssize_t i = 0;
12984 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012985
Georg Brandlceee0772007-11-27 23:48:05 +000012986 new = PyDict_New();
12987 if (!new)
12988 return NULL;
12989 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012990 int x_kind, y_kind, z_kind;
12991 void *x_data, *y_data, *z_data;
12992
Georg Brandlceee0772007-11-27 23:48:05 +000012993 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012994 if (!PyUnicode_Check(x)) {
12995 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12996 "be a string if there is a second argument");
12997 goto err;
12998 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012999 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013000 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13001 "arguments must have equal length");
13002 goto err;
13003 }
13004 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013005 x_kind = PyUnicode_KIND(x);
13006 y_kind = PyUnicode_KIND(y);
13007 x_data = PyUnicode_DATA(x);
13008 y_data = PyUnicode_DATA(y);
13009 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13010 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013011 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013012 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013013 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013014 if (!value) {
13015 Py_DECREF(key);
13016 goto err;
13017 }
Georg Brandlceee0772007-11-27 23:48:05 +000013018 res = PyDict_SetItem(new, key, value);
13019 Py_DECREF(key);
13020 Py_DECREF(value);
13021 if (res < 0)
13022 goto err;
13023 }
13024 /* create entries for deleting chars in z */
13025 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013026 z_kind = PyUnicode_KIND(z);
13027 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013028 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013029 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013030 if (!key)
13031 goto err;
13032 res = PyDict_SetItem(new, key, Py_None);
13033 Py_DECREF(key);
13034 if (res < 0)
13035 goto err;
13036 }
13037 }
13038 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013039 int kind;
13040 void *data;
13041
Georg Brandlceee0772007-11-27 23:48:05 +000013042 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013043 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013044 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13045 "to maketrans it must be a dict");
13046 goto err;
13047 }
13048 /* copy entries into the new dict, converting string keys to int keys */
13049 while (PyDict_Next(x, &i, &key, &value)) {
13050 if (PyUnicode_Check(key)) {
13051 /* convert string keys to integer keys */
13052 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013053 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013054 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13055 "table must be of length 1");
13056 goto err;
13057 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013058 kind = PyUnicode_KIND(key);
13059 data = PyUnicode_DATA(key);
13060 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013061 if (!newkey)
13062 goto err;
13063 res = PyDict_SetItem(new, newkey, value);
13064 Py_DECREF(newkey);
13065 if (res < 0)
13066 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013067 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013068 /* just keep integer keys */
13069 if (PyDict_SetItem(new, key, value) < 0)
13070 goto err;
13071 } else {
13072 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13073 "be strings or integers");
13074 goto err;
13075 }
13076 }
13077 }
13078 return new;
13079 err:
13080 Py_DECREF(new);
13081 return NULL;
13082}
13083
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013084PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013085 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013087Return a copy of the string S in which each character has been mapped\n\
13088through the given translation table. The table must implement\n\
13089lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13090mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13091this operation raises LookupError, the character is left untouched.\n\
13092Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013093
13094static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013095unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013096{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013097 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098}
13099
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013100PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013101 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013103Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013104
13105static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013106unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013108 if (PyUnicode_READY(self) == -1)
13109 return NULL;
13110 if (PyUnicode_IS_ASCII(self))
13111 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013112 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113}
13114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013115PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013116 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013118Pad a numeric string S with zeros on the left, to fill a field\n\
13119of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120
13121static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013122unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013123{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013124 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013125 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013126 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013127 int kind;
13128 void *data;
13129 Py_UCS4 chr;
13130
Martin v. Löwis18e16552006-02-15 17:27:45 +000013131 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132 return NULL;
13133
Benjamin Petersonbac79492012-01-14 13:34:47 -050013134 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013135 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136
Victor Stinnerc4b49542011-12-11 22:44:26 +010013137 if (PyUnicode_GET_LENGTH(self) >= width)
13138 return unicode_result_unchanged(self);
13139
13140 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013141
13142 u = pad(self, fill, 0, '0');
13143
Walter Dörwald068325e2002-04-15 13:36:47 +000013144 if (u == NULL)
13145 return NULL;
13146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013147 kind = PyUnicode_KIND(u);
13148 data = PyUnicode_DATA(u);
13149 chr = PyUnicode_READ(kind, data, fill);
13150
13151 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013152 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013153 PyUnicode_WRITE(kind, data, 0, chr);
13154 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013155 }
13156
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013157 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013158 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013159}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013160
#if 0
/* Compiled out by the enclosing #if 0.  Thin wrapper that returns the
   result of PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13168
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013169PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013170 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013171\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013172Return True if S starts with the specified prefix, False otherwise.\n\
13173With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013174With optional end, stop comparing S at that position.\n\
13175prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013176
13177static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013178unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013179 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013180{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013181 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013182 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013183 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013184 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013185 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013186
Jesus Ceaac451502011-04-20 17:09:23 +020013187 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013188 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013189 if (PyTuple_Check(subobj)) {
13190 Py_ssize_t i;
13191 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013192 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013193 if (substring == NULL)
13194 return NULL;
13195 result = tailmatch(self, substring, start, end, -1);
13196 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013197 if (result == -1)
13198 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013199 if (result) {
13200 Py_RETURN_TRUE;
13201 }
13202 }
13203 /* nothing matched */
13204 Py_RETURN_FALSE;
13205 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013206 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013207 if (substring == NULL) {
13208 if (PyErr_ExceptionMatches(PyExc_TypeError))
13209 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13210 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013211 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013212 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013213 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013214 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013215 if (result == -1)
13216 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013217 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013218}
13219
13220
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013221PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013222 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013223\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013224Return True if S ends with the specified suffix, False otherwise.\n\
13225With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013226With optional end, stop comparing S at that position.\n\
13227suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013228
13229static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013230unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013231 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013232{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013233 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013234 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013235 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013236 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013237 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013238
Jesus Ceaac451502011-04-20 17:09:23 +020013239 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013240 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013241 if (PyTuple_Check(subobj)) {
13242 Py_ssize_t i;
13243 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013244 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013245 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013246 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013247 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013248 result = tailmatch(self, substring, start, end, +1);
13249 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013250 if (result == -1)
13251 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013252 if (result) {
13253 Py_RETURN_TRUE;
13254 }
13255 }
13256 Py_RETURN_FALSE;
13257 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013258 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013259 if (substring == NULL) {
13260 if (PyErr_ExceptionMatches(PyExc_TypeError))
13261 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13262 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013263 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013264 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013265 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013266 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013267 if (result == -1)
13268 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013269 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013270}
13271
Victor Stinner202fdca2012-05-07 12:47:02 +020013272Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013273_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013274{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013275 if (!writer->readonly)
13276 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13277 else {
13278 /* Copy-on-write mode: set buffer size to 0 so
13279 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13280 * next write. */
13281 writer->size = 0;
13282 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013283 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13284 writer->data = PyUnicode_DATA(writer->buffer);
13285 writer->kind = PyUnicode_KIND(writer->buffer);
13286}
13287
Victor Stinnerd3f08822012-05-29 12:57:52 +020013288void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013289_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013290{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013291 memset(writer, 0, sizeof(*writer));
13292#ifdef Py_DEBUG
13293 writer->kind = 5; /* invalid kind */
13294#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013295 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013296}
13297
Victor Stinnerd3f08822012-05-29 12:57:52 +020013298int
13299_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13300 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013301{
Victor Stinner6989ba02013-11-18 21:08:39 +010013302#ifdef MS_WINDOWS
13303 /* On Windows, overallocate by 50% is the best factor */
13304# define OVERALLOCATE_FACTOR 2
13305#else
13306 /* On Linux, overallocate by 25% is the best factor */
13307# define OVERALLOCATE_FACTOR 4
13308#endif
Victor Stinner202fdca2012-05-07 12:47:02 +020013309 Py_ssize_t newlen;
13310 PyObject *newbuffer;
13311
Victor Stinnerd3f08822012-05-29 12:57:52 +020013312 assert(length > 0);
13313
Victor Stinner202fdca2012-05-07 12:47:02 +020013314 if (length > PY_SSIZE_T_MAX - writer->pos) {
13315 PyErr_NoMemory();
13316 return -1;
13317 }
13318 newlen = writer->pos + length;
13319
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013320 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013321
Victor Stinnerd3f08822012-05-29 12:57:52 +020013322 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013323 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013324 if (writer->overallocate
13325 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13326 /* overallocate to limit the number of realloc() */
13327 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013328 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013329 if (newlen < writer->min_length)
13330 newlen = writer->min_length;
13331
Victor Stinnerd3f08822012-05-29 12:57:52 +020013332 writer->buffer = PyUnicode_New(newlen, maxchar);
13333 if (writer->buffer == NULL)
13334 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013335 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013336 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013337 if (writer->overallocate
13338 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13339 /* overallocate to limit the number of realloc() */
13340 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013341 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013342 if (newlen < writer->min_length)
13343 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013344
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013345 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013346 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013347 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013348 newbuffer = PyUnicode_New(newlen, maxchar);
13349 if (newbuffer == NULL)
13350 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013351 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13352 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013353 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013354 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013355 }
13356 else {
13357 newbuffer = resize_compact(writer->buffer, newlen);
13358 if (newbuffer == NULL)
13359 return -1;
13360 }
13361 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013362 }
13363 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013364 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013365 newbuffer = PyUnicode_New(writer->size, maxchar);
13366 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013367 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013368 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13369 writer->buffer, 0, writer->pos);
13370 Py_DECREF(writer->buffer);
13371 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013372 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013373 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013374 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013375
13376#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013377}
13378
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013379Py_LOCAL_INLINE(int)
13380_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013381{
13382 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13383 return -1;
13384 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13385 writer->pos++;
13386 return 0;
13387}
13388
13389int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013390_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13391{
13392 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13393}
13394
13395int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013396_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13397{
13398 Py_UCS4 maxchar;
13399 Py_ssize_t len;
13400
13401 if (PyUnicode_READY(str) == -1)
13402 return -1;
13403 len = PyUnicode_GET_LENGTH(str);
13404 if (len == 0)
13405 return 0;
13406 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13407 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013408 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013409 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013410 Py_INCREF(str);
13411 writer->buffer = str;
13412 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013413 writer->pos += len;
13414 return 0;
13415 }
13416 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13417 return -1;
13418 }
13419 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13420 str, 0, len);
13421 writer->pos += len;
13422 return 0;
13423}
13424
Victor Stinnere215d962012-10-06 23:03:36 +020013425int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013426_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13427 Py_ssize_t start, Py_ssize_t end)
13428{
13429 Py_UCS4 maxchar;
13430 Py_ssize_t len;
13431
13432 if (PyUnicode_READY(str) == -1)
13433 return -1;
13434
13435 assert(0 <= start);
13436 assert(end <= PyUnicode_GET_LENGTH(str));
13437 assert(start <= end);
13438
13439 if (end == 0)
13440 return 0;
13441
13442 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13443 return _PyUnicodeWriter_WriteStr(writer, str);
13444
13445 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13446 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13447 else
13448 maxchar = writer->maxchar;
13449 len = end - start;
13450
13451 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13452 return -1;
13453
13454 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13455 str, start, len);
13456 writer->pos += len;
13457 return 0;
13458}
13459
13460int
Victor Stinner4a587072013-11-19 12:54:53 +010013461_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13462 const char *ascii, Py_ssize_t len)
13463{
13464 if (len == -1)
13465 len = strlen(ascii);
13466
13467 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13468
13469 if (writer->buffer == NULL && !writer->overallocate) {
13470 PyObject *str;
13471
13472 str = _PyUnicode_FromASCII(ascii, len);
13473 if (str == NULL)
13474 return -1;
13475
13476 writer->readonly = 1;
13477 writer->buffer = str;
13478 _PyUnicodeWriter_Update(writer);
13479 writer->pos += len;
13480 return 0;
13481 }
13482
13483 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13484 return -1;
13485
13486 switch (writer->kind)
13487 {
13488 case PyUnicode_1BYTE_KIND:
13489 {
13490 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13491 Py_UCS1 *data = writer->data;
13492
13493 Py_MEMCPY(data + writer->pos, str, len);
13494 break;
13495 }
13496 case PyUnicode_2BYTE_KIND:
13497 {
13498 _PyUnicode_CONVERT_BYTES(
13499 Py_UCS1, Py_UCS2,
13500 ascii, ascii + len,
13501 (Py_UCS2 *)writer->data + writer->pos);
13502 break;
13503 }
13504 case PyUnicode_4BYTE_KIND:
13505 {
13506 _PyUnicode_CONVERT_BYTES(
13507 Py_UCS1, Py_UCS4,
13508 ascii, ascii + len,
13509 (Py_UCS4 *)writer->data + writer->pos);
13510 break;
13511 }
13512 default:
13513 assert(0);
13514 }
13515
13516 writer->pos += len;
13517 return 0;
13518}
13519
13520int
13521_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13522 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013523{
13524 Py_UCS4 maxchar;
13525
13526 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13527 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13528 return -1;
13529 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13530 writer->pos += len;
13531 return 0;
13532}
13533
Victor Stinnerd3f08822012-05-29 12:57:52 +020013534PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013535_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013536{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013537 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013538 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013539 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013540 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013541 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013542 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013543 str = writer->buffer;
13544 writer->buffer = NULL;
13545 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13546 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013547 }
13548 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13549 PyObject *newbuffer;
13550 newbuffer = resize_compact(writer->buffer, writer->pos);
13551 if (newbuffer == NULL) {
Serhiy Storchakadfe98a12014-02-09 13:46:20 +020013552 Py_CLEAR(writer->buffer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013553 return NULL;
13554 }
13555 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013556 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013557 str = writer->buffer;
13558 writer->buffer = NULL;
13559 assert(_PyUnicode_CheckConsistency(str, 1));
13560 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013561}
13562
Victor Stinnerd3f08822012-05-29 12:57:52 +020013563void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013564_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013565{
13566 Py_CLEAR(writer->buffer);
13567}
13568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013569#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013570
13571PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013572 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013573\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013574Return a formatted version of S, using substitutions from args and kwargs.\n\
13575The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013576
Eric Smith27bbca62010-11-04 17:06:58 +000013577PyDoc_STRVAR(format_map__doc__,
13578 "S.format_map(mapping) -> str\n\
13579\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013580Return a formatted version of S, using substitutions from mapping.\n\
13581The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013582
Eric Smith4a7d76d2008-05-30 18:10:19 +000013583static PyObject *
13584unicode__format__(PyObject* self, PyObject* args)
13585{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013586 PyObject *format_spec;
13587 _PyUnicodeWriter writer;
13588 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013589
13590 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13591 return NULL;
13592
Victor Stinnerd3f08822012-05-29 12:57:52 +020013593 if (PyUnicode_READY(self) == -1)
13594 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013595 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013596 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13597 self, format_spec, 0,
13598 PyUnicode_GET_LENGTH(format_spec));
13599 if (ret == -1) {
13600 _PyUnicodeWriter_Dealloc(&writer);
13601 return NULL;
13602 }
13603 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013604}
13605
Eric Smith8c663262007-08-25 02:26:07 +000013606PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013607 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013608\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013609Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013610
13611static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013612unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013613{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013614 Py_ssize_t size;
13615
13616 /* If it's a compact object, account for base structure +
13617 character data. */
13618 if (PyUnicode_IS_COMPACT_ASCII(v))
13619 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13620 else if (PyUnicode_IS_COMPACT(v))
13621 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013622 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013623 else {
13624 /* If it is a two-block object, account for base object, and
13625 for character block if present. */
13626 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013627 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013628 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013629 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013630 }
13631 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013632 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013633 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013634 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013635 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013636 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013637
13638 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013639}
13640
13641PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013642 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013643
13644static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013645unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013646{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013647 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013648 if (!copy)
13649 return NULL;
13650 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013651}
13652
Guido van Rossumd57fd912000-03-10 22:53:23 +000013653static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013654 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013655 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013656 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13657 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013658 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13659 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013660 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013661 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13662 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13663 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013664 {"expandtabs", (PyCFunction) unicode_expandtabs,
13665 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013666 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013667 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013668 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13669 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13670 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013671 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013672 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13673 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13674 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013675 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013676 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013677 {"splitlines", (PyCFunction) unicode_splitlines,
13678 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013679 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013680 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13681 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13682 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13683 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13684 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13685 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13686 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13687 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13688 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13689 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13690 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13691 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13692 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13693 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013694 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013695 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013696 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013697 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013698 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013699 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013700 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013701 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013702#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013703 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013704 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013705#endif
13706
Benjamin Peterson14339b62009-01-31 16:36:08 +000013707 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013708 {NULL, NULL}
13709};
13710
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013711static PyObject *
13712unicode_mod(PyObject *v, PyObject *w)
13713{
Brian Curtindfc80e32011-08-10 20:28:54 -050013714 if (!PyUnicode_Check(v))
13715 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013716 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013717}
13718
13719static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013720 0, /*nb_add*/
13721 0, /*nb_subtract*/
13722 0, /*nb_multiply*/
13723 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013724};
13725
Guido van Rossumd57fd912000-03-10 22:53:23 +000013726static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013727 (lenfunc) unicode_length, /* sq_length */
13728 PyUnicode_Concat, /* sq_concat */
13729 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13730 (ssizeargfunc) unicode_getitem, /* sq_item */
13731 0, /* sq_slice */
13732 0, /* sq_ass_item */
13733 0, /* sq_ass_slice */
13734 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013735};
13736
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013737static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013738unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013739{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013740 if (PyUnicode_READY(self) == -1)
13741 return NULL;
13742
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013743 if (PyIndex_Check(item)) {
13744 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013745 if (i == -1 && PyErr_Occurred())
13746 return NULL;
13747 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013748 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013749 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013750 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013751 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013752 PyObject *result;
13753 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013754 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013755 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013756
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013757 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013758 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013759 return NULL;
13760 }
13761
13762 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013763 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013764 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013765 slicelength == PyUnicode_GET_LENGTH(self)) {
13766 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013767 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013768 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013769 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013770 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013771 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013772 src_kind = PyUnicode_KIND(self);
13773 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013774 if (!PyUnicode_IS_ASCII(self)) {
13775 kind_limit = kind_maxchar_limit(src_kind);
13776 max_char = 0;
13777 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13778 ch = PyUnicode_READ(src_kind, src_data, cur);
13779 if (ch > max_char) {
13780 max_char = ch;
13781 if (max_char >= kind_limit)
13782 break;
13783 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013784 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013785 }
Victor Stinner55c99112011-10-13 01:17:06 +020013786 else
13787 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013788 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013789 if (result == NULL)
13790 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013791 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013792 dest_data = PyUnicode_DATA(result);
13793
13794 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013795 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13796 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013797 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013798 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013799 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013800 } else {
13801 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13802 return NULL;
13803 }
13804}
13805
13806static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013807 (lenfunc)unicode_length, /* mp_length */
13808 (binaryfunc)unicode_subscript, /* mp_subscript */
13809 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013810};
13811
Guido van Rossumd57fd912000-03-10 22:53:23 +000013812
Guido van Rossumd57fd912000-03-10 22:53:23 +000013813/* Helpers for PyUnicode_Format() */
13814
Victor Stinnera47082312012-10-04 02:19:54 +020013815struct unicode_formatter_t {
13816 PyObject *args;
13817 int args_owned;
13818 Py_ssize_t arglen, argidx;
13819 PyObject *dict;
13820
13821 enum PyUnicode_Kind fmtkind;
13822 Py_ssize_t fmtcnt, fmtpos;
13823 void *fmtdata;
13824 PyObject *fmtstr;
13825
13826 _PyUnicodeWriter writer;
13827};
13828
13829struct unicode_format_arg_t {
13830 Py_UCS4 ch;
13831 int flags;
13832 Py_ssize_t width;
13833 int prec;
13834 int sign;
13835};
13836
Guido van Rossumd57fd912000-03-10 22:53:23 +000013837static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013838unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013839{
Victor Stinnera47082312012-10-04 02:19:54 +020013840 Py_ssize_t argidx = ctx->argidx;
13841
13842 if (argidx < ctx->arglen) {
13843 ctx->argidx++;
13844 if (ctx->arglen < 0)
13845 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013846 else
Victor Stinnera47082312012-10-04 02:19:54 +020013847 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013848 }
13849 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013850 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013851 return NULL;
13852}
13853
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013854/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013855
Victor Stinnera47082312012-10-04 02:19:54 +020013856/* Format a float into the writer if the writer is not NULL, or into *p_output
13857 otherwise.
13858
13859 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013860static int
Victor Stinnera47082312012-10-04 02:19:54 +020013861formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13862 PyObject **p_output,
13863 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013864{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013865 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013866 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013867 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013868 int prec;
13869 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013870
Guido van Rossumd57fd912000-03-10 22:53:23 +000013871 x = PyFloat_AsDouble(v);
13872 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013873 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013874
Victor Stinnera47082312012-10-04 02:19:54 +020013875 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013876 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013877 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013878
Victor Stinnera47082312012-10-04 02:19:54 +020013879 if (arg->flags & F_ALT)
13880 dtoa_flags = Py_DTSF_ALT;
13881 else
13882 dtoa_flags = 0;
13883 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013884 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013885 return -1;
13886 len = strlen(p);
13887 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013888 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013889 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013890 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013891 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013892 }
13893 else
13894 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013895 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013896 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013897}
13898
Victor Stinnerd0880d52012-04-27 23:40:13 +020013899/* formatlong() emulates the format codes d, u, o, x and X, and
13900 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13901 * Python's regular ints.
13902 * Return value: a new PyUnicodeObject*, or NULL if error.
13903 * The output string is of the form
13904 * "-"? ("0x" | "0X")? digit+
13905 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13906 * set in flags. The case of hex digits will be correct,
13907 * There will be at least prec digits, zero-filled on the left if
13908 * necessary to get that many.
13909 * val object to be converted
13910 * flags bitmask of format flags; only F_ALT is looked at
13911 * prec minimum number of digits; 0-fill on left if needed
13912 * type a character in [duoxX]; u acts the same as d
13913 *
13914 * CAUTION: o, x and X conversions on regular ints can never
13915 * produce a '-' sign, but can for Python's unbounded ints.
13916 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013917static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013918formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013919{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013920 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013921 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013922 Py_ssize_t i;
13923 int sign; /* 1 if '-', else 0 */
13924 int len; /* number of characters */
13925 Py_ssize_t llen;
13926 int numdigits; /* len == numnondigits + numdigits */
13927 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013928 int prec = arg->prec;
13929 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013930
Victor Stinnerd0880d52012-04-27 23:40:13 +020013931 /* Avoid exceeding SSIZE_T_MAX */
13932 if (prec > INT_MAX-3) {
13933 PyErr_SetString(PyExc_OverflowError,
13934 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013935 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013936 }
13937
13938 assert(PyLong_Check(val));
13939
13940 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013941 default:
13942 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013943 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013944 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013945 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013946 /* int and int subclasses should print numerically when a numeric */
13947 /* format code is used (see issue18780) */
13948 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013949 break;
13950 case 'o':
13951 numnondigits = 2;
13952 result = PyNumber_ToBase(val, 8);
13953 break;
13954 case 'x':
13955 case 'X':
13956 numnondigits = 2;
13957 result = PyNumber_ToBase(val, 16);
13958 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013959 }
13960 if (!result)
13961 return NULL;
13962
13963 assert(unicode_modifiable(result));
13964 assert(PyUnicode_IS_READY(result));
13965 assert(PyUnicode_IS_ASCII(result));
13966
13967 /* To modify the string in-place, there can only be one reference. */
13968 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013969 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013970 PyErr_BadInternalCall();
13971 return NULL;
13972 }
13973 buf = PyUnicode_DATA(result);
13974 llen = PyUnicode_GET_LENGTH(result);
13975 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013976 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013977 PyErr_SetString(PyExc_ValueError,
13978 "string too large in _PyBytes_FormatLong");
13979 return NULL;
13980 }
13981 len = (int)llen;
13982 sign = buf[0] == '-';
13983 numnondigits += sign;
13984 numdigits = len - numnondigits;
13985 assert(numdigits > 0);
13986
13987 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013988 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013989 (type == 'o' || type == 'x' || type == 'X'))) {
13990 assert(buf[sign] == '0');
13991 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13992 buf[sign+1] == 'o');
13993 numnondigits -= 2;
13994 buf += 2;
13995 len -= 2;
13996 if (sign)
13997 buf[0] = '-';
13998 assert(len == numnondigits + numdigits);
13999 assert(numdigits > 0);
14000 }
14001
14002 /* Fill with leading zeroes to meet minimum width. */
14003 if (prec > numdigits) {
14004 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14005 numnondigits + prec);
14006 char *b1;
14007 if (!r1) {
14008 Py_DECREF(result);
14009 return NULL;
14010 }
14011 b1 = PyBytes_AS_STRING(r1);
14012 for (i = 0; i < numnondigits; ++i)
14013 *b1++ = *buf++;
14014 for (i = 0; i < prec - numdigits; i++)
14015 *b1++ = '0';
14016 for (i = 0; i < numdigits; i++)
14017 *b1++ = *buf++;
14018 *b1 = '\0';
14019 Py_DECREF(result);
14020 result = r1;
14021 buf = PyBytes_AS_STRING(result);
14022 len = numnondigits + prec;
14023 }
14024
14025 /* Fix up case for hex conversions. */
14026 if (type == 'X') {
14027 /* Need to convert all lower case letters to upper case.
14028 and need to convert 0x to 0X (and -0x to -0X). */
14029 for (i = 0; i < len; i++)
14030 if (buf[i] >= 'a' && buf[i] <= 'x')
14031 buf[i] -= 'a'-'A';
14032 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014033 if (!PyUnicode_Check(result)
14034 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014035 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014036 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014037 Py_DECREF(result);
14038 result = unicode;
14039 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014040 else if (len != PyUnicode_GET_LENGTH(result)) {
14041 if (PyUnicode_Resize(&result, len) < 0)
14042 Py_CLEAR(result);
14043 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014044 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014045}
14046
Ethan Furmandf3ed242014-01-05 06:50:30 -080014047/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014048 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014049 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014050 * -1 and raise an exception on error */
14051static int
Victor Stinnera47082312012-10-04 02:19:54 +020014052mainformatlong(PyObject *v,
14053 struct unicode_format_arg_t *arg,
14054 PyObject **p_output,
14055 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014056{
14057 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014058 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014059
14060 if (!PyNumber_Check(v))
14061 goto wrongtype;
14062
Ethan Furmanf9bba9c2014-01-11 23:20:58 -080014063 /* make sure number is a type of integer */
Ethan Furmana70805e2014-01-12 08:42:35 -080014064 /* if not, issue deprecation warning for now */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014065 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014066 if (type == 'o' || type == 'x' || type == 'X') {
14067 iobj = PyNumber_Index(v);
14068 if (iobj == NULL) {
Ethan Furmanf9bba9c2014-01-11 23:20:58 -080014069 PyErr_Clear();
14070 if (PyErr_WarnEx(PyExc_DeprecationWarning,
14071 "automatic int conversions have been deprecated",
14072 1)) {
14073 return -1;
14074 }
14075 iobj = PyNumber_Long(v);
14076 if (iobj == NULL ) {
14077 if (PyErr_ExceptionMatches(PyExc_TypeError))
14078 goto wrongtype;
14079 return -1;
14080 }
Ethan Furmandf3ed242014-01-05 06:50:30 -080014081 }
14082 }
14083 else {
14084 iobj = PyNumber_Long(v);
14085 if (iobj == NULL ) {
14086 if (PyErr_ExceptionMatches(PyExc_TypeError))
14087 goto wrongtype;
14088 return -1;
14089 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014090 }
14091 assert(PyLong_Check(iobj));
14092 }
14093 else {
14094 iobj = v;
14095 Py_INCREF(iobj);
14096 }
14097
14098 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014099 && arg->width == -1 && arg->prec == -1
14100 && !(arg->flags & (F_SIGN | F_BLANK))
14101 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014102 {
14103 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014104 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014105 int base;
14106
Victor Stinnera47082312012-10-04 02:19:54 +020014107 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014108 {
14109 default:
14110 assert(0 && "'type' not in [diuoxX]");
14111 case 'd':
14112 case 'i':
14113 case 'u':
14114 base = 10;
14115 break;
14116 case 'o':
14117 base = 8;
14118 break;
14119 case 'x':
14120 case 'X':
14121 base = 16;
14122 break;
14123 }
14124
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014125 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14126 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014127 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014128 }
14129 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014130 return 1;
14131 }
14132
Victor Stinnera47082312012-10-04 02:19:54 +020014133 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014134 Py_DECREF(iobj);
14135 if (res == NULL)
14136 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014137 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014138 return 0;
14139
14140wrongtype:
14141 PyErr_Format(PyExc_TypeError,
14142 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020014143 "not %.200s",
14144 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014145 return -1;
14146}
14147
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014148static Py_UCS4
14149formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014150{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014151 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014152 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014153 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014154 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014155 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014156 goto onError;
14157 }
14158 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014159 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014160 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014161 /* make sure number is a type of integer */
Ethan Furmana70805e2014-01-12 08:42:35 -080014162 /* if not, issue deprecation warning for now */
Ethan Furmandf3ed242014-01-05 06:50:30 -080014163 if (!PyLong_Check(v)) {
14164 iobj = PyNumber_Index(v);
14165 if (iobj == NULL) {
Ethan Furmanf9bba9c2014-01-11 23:20:58 -080014166 PyErr_Clear();
14167 if (PyErr_WarnEx(PyExc_DeprecationWarning,
14168 "automatic int conversions have been deprecated",
14169 1)) {
14170 return -1;
14171 }
14172 iobj = PyNumber_Long(v);
14173 if (iobj == NULL ) {
14174 if (PyErr_ExceptionMatches(PyExc_TypeError))
14175 goto onError;
14176 return -1;
14177 }
Ethan Furmandf3ed242014-01-05 06:50:30 -080014178 }
14179 v = iobj;
14180 Py_DECREF(iobj);
14181 }
14182 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014183 x = PyLong_AsLong(v);
14184 if (x == -1 && PyErr_Occurred())
14185 goto onError;
14186
Victor Stinner8faf8212011-12-08 22:14:11 +010014187 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014188 PyErr_SetString(PyExc_OverflowError,
14189 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014190 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014191 }
14192
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014193 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014194 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014195
Benjamin Peterson29060642009-01-31 22:14:21 +000014196 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014197 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014198 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014199 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014200}
14201
Victor Stinnera47082312012-10-04 02:19:54 +020014202/* Parse options of an argument: flags, width, precision.
14203 Handle also "%(name)" syntax.
14204
14205 Return 0 if the argument has been formatted into arg->str.
14206 Return 1 if the argument has been written into ctx->writer,
14207 Raise an exception and return -1 on error. */
14208static int
14209unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14210 struct unicode_format_arg_t *arg)
14211{
14212#define FORMAT_READ(ctx) \
14213 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14214
14215 PyObject *v;
14216
Victor Stinnera47082312012-10-04 02:19:54 +020014217 if (arg->ch == '(') {
14218 /* Get argument value from a dictionary. Example: "%(name)s". */
14219 Py_ssize_t keystart;
14220 Py_ssize_t keylen;
14221 PyObject *key;
14222 int pcount = 1;
14223
14224 if (ctx->dict == NULL) {
14225 PyErr_SetString(PyExc_TypeError,
14226 "format requires a mapping");
14227 return -1;
14228 }
14229 ++ctx->fmtpos;
14230 --ctx->fmtcnt;
14231 keystart = ctx->fmtpos;
14232 /* Skip over balanced parentheses */
14233 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14234 arg->ch = FORMAT_READ(ctx);
14235 if (arg->ch == ')')
14236 --pcount;
14237 else if (arg->ch == '(')
14238 ++pcount;
14239 ctx->fmtpos++;
14240 }
14241 keylen = ctx->fmtpos - keystart - 1;
14242 if (ctx->fmtcnt < 0 || pcount > 0) {
14243 PyErr_SetString(PyExc_ValueError,
14244 "incomplete format key");
14245 return -1;
14246 }
14247 key = PyUnicode_Substring(ctx->fmtstr,
14248 keystart, keystart + keylen);
14249 if (key == NULL)
14250 return -1;
14251 if (ctx->args_owned) {
14252 Py_DECREF(ctx->args);
14253 ctx->args_owned = 0;
14254 }
14255 ctx->args = PyObject_GetItem(ctx->dict, key);
14256 Py_DECREF(key);
14257 if (ctx->args == NULL)
14258 return -1;
14259 ctx->args_owned = 1;
14260 ctx->arglen = -1;
14261 ctx->argidx = -2;
14262 }
14263
14264 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014265 while (--ctx->fmtcnt >= 0) {
14266 arg->ch = FORMAT_READ(ctx);
14267 ctx->fmtpos++;
14268 switch (arg->ch) {
14269 case '-': arg->flags |= F_LJUST; continue;
14270 case '+': arg->flags |= F_SIGN; continue;
14271 case ' ': arg->flags |= F_BLANK; continue;
14272 case '#': arg->flags |= F_ALT; continue;
14273 case '0': arg->flags |= F_ZERO; continue;
14274 }
14275 break;
14276 }
14277
14278 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014279 if (arg->ch == '*') {
14280 v = unicode_format_getnextarg(ctx);
14281 if (v == NULL)
14282 return -1;
14283 if (!PyLong_Check(v)) {
14284 PyErr_SetString(PyExc_TypeError,
14285 "* wants int");
14286 return -1;
14287 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014288 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014289 if (arg->width == -1 && PyErr_Occurred())
14290 return -1;
14291 if (arg->width < 0) {
14292 arg->flags |= F_LJUST;
14293 arg->width = -arg->width;
14294 }
14295 if (--ctx->fmtcnt >= 0) {
14296 arg->ch = FORMAT_READ(ctx);
14297 ctx->fmtpos++;
14298 }
14299 }
14300 else if (arg->ch >= '0' && arg->ch <= '9') {
14301 arg->width = arg->ch - '0';
14302 while (--ctx->fmtcnt >= 0) {
14303 arg->ch = FORMAT_READ(ctx);
14304 ctx->fmtpos++;
14305 if (arg->ch < '0' || arg->ch > '9')
14306 break;
14307 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14308 mixing signed and unsigned comparison. Since arg->ch is between
14309 '0' and '9', casting to int is safe. */
14310 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14311 PyErr_SetString(PyExc_ValueError,
14312 "width too big");
14313 return -1;
14314 }
14315 arg->width = arg->width*10 + (arg->ch - '0');
14316 }
14317 }
14318
14319 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014320 if (arg->ch == '.') {
14321 arg->prec = 0;
14322 if (--ctx->fmtcnt >= 0) {
14323 arg->ch = FORMAT_READ(ctx);
14324 ctx->fmtpos++;
14325 }
14326 if (arg->ch == '*') {
14327 v = unicode_format_getnextarg(ctx);
14328 if (v == NULL)
14329 return -1;
14330 if (!PyLong_Check(v)) {
14331 PyErr_SetString(PyExc_TypeError,
14332 "* wants int");
14333 return -1;
14334 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014335 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014336 if (arg->prec == -1 && PyErr_Occurred())
14337 return -1;
14338 if (arg->prec < 0)
14339 arg->prec = 0;
14340 if (--ctx->fmtcnt >= 0) {
14341 arg->ch = FORMAT_READ(ctx);
14342 ctx->fmtpos++;
14343 }
14344 }
14345 else if (arg->ch >= '0' && arg->ch <= '9') {
14346 arg->prec = arg->ch - '0';
14347 while (--ctx->fmtcnt >= 0) {
14348 arg->ch = FORMAT_READ(ctx);
14349 ctx->fmtpos++;
14350 if (arg->ch < '0' || arg->ch > '9')
14351 break;
14352 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14353 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014354 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014355 return -1;
14356 }
14357 arg->prec = arg->prec*10 + (arg->ch - '0');
14358 }
14359 }
14360 }
14361
14362 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14363 if (ctx->fmtcnt >= 0) {
14364 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14365 if (--ctx->fmtcnt >= 0) {
14366 arg->ch = FORMAT_READ(ctx);
14367 ctx->fmtpos++;
14368 }
14369 }
14370 }
14371 if (ctx->fmtcnt < 0) {
14372 PyErr_SetString(PyExc_ValueError,
14373 "incomplete format");
14374 return -1;
14375 }
14376 return 0;
14377
14378#undef FORMAT_READ
14379}
14380
14381/* Format one argument. Supported conversion specifiers:
14382
14383 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014384 - "i", "d", "u": int or float
14385 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014386 - "e", "E", "f", "F", "g", "G": float
14387 - "c": int or str (1 character)
14388
Victor Stinner8dbd4212012-12-04 09:30:24 +010014389 When possible, the output is written directly into the Unicode writer
14390 (ctx->writer). A string is created when padding is required.
14391
Victor Stinnera47082312012-10-04 02:19:54 +020014392 Return 0 if the argument has been formatted into *p_str,
14393 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014394 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014395static int
14396unicode_format_arg_format(struct unicode_formatter_t *ctx,
14397 struct unicode_format_arg_t *arg,
14398 PyObject **p_str)
14399{
14400 PyObject *v;
14401 _PyUnicodeWriter *writer = &ctx->writer;
14402
14403 if (ctx->fmtcnt == 0)
14404 ctx->writer.overallocate = 0;
14405
14406 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014407 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014408 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014409 return 1;
14410 }
14411
14412 v = unicode_format_getnextarg(ctx);
14413 if (v == NULL)
14414 return -1;
14415
Victor Stinnera47082312012-10-04 02:19:54 +020014416
14417 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014418 case 's':
14419 case 'r':
14420 case 'a':
14421 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14422 /* Fast path */
14423 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14424 return -1;
14425 return 1;
14426 }
14427
14428 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14429 *p_str = v;
14430 Py_INCREF(*p_str);
14431 }
14432 else {
14433 if (arg->ch == 's')
14434 *p_str = PyObject_Str(v);
14435 else if (arg->ch == 'r')
14436 *p_str = PyObject_Repr(v);
14437 else
14438 *p_str = PyObject_ASCII(v);
14439 }
14440 break;
14441
14442 case 'i':
14443 case 'd':
14444 case 'u':
14445 case 'o':
14446 case 'x':
14447 case 'X':
14448 {
14449 int ret = mainformatlong(v, arg, p_str, writer);
14450 if (ret != 0)
14451 return ret;
14452 arg->sign = 1;
14453 break;
14454 }
14455
14456 case 'e':
14457 case 'E':
14458 case 'f':
14459 case 'F':
14460 case 'g':
14461 case 'G':
14462 if (arg->width == -1 && arg->prec == -1
14463 && !(arg->flags & (F_SIGN | F_BLANK)))
14464 {
14465 /* Fast path */
14466 if (formatfloat(v, arg, NULL, writer) == -1)
14467 return -1;
14468 return 1;
14469 }
14470
14471 arg->sign = 1;
14472 if (formatfloat(v, arg, p_str, NULL) == -1)
14473 return -1;
14474 break;
14475
14476 case 'c':
14477 {
14478 Py_UCS4 ch = formatchar(v);
14479 if (ch == (Py_UCS4) -1)
14480 return -1;
14481 if (arg->width == -1 && arg->prec == -1) {
14482 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014483 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014484 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014485 return 1;
14486 }
14487 *p_str = PyUnicode_FromOrdinal(ch);
14488 break;
14489 }
14490
14491 default:
14492 PyErr_Format(PyExc_ValueError,
14493 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014494 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014495 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14496 (int)arg->ch,
14497 ctx->fmtpos - 1);
14498 return -1;
14499 }
14500 if (*p_str == NULL)
14501 return -1;
14502 assert (PyUnicode_Check(*p_str));
14503 return 0;
14504}
14505
/* Write the already formatted string `str` into ctx->writer, applying the
   width, precision, sign, zero-fill, left-justify and alternate-form
   settings stored in *arg.

   `str` is only read; no reference to it is released here.
   arg->width and arg->sign are modified while the output is laid out.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero padding is only used when arg->sign is set and the "0" flag
       was given; everything else is padded with spaces. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path: no padding, no truncation and no sign handling
           requested, so the string is copied unchanged. */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The sign is part of str: remember it and skip it so that
               it can be written separately from the digits. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* No sign character to emit at all. */
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer: compute the largest code point that will be
       written (fill character and string content) and reserve space. */
    maxchar = writer->maxchar;
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    buflen = arg->width;
    /* One extra slot for the sign when the digits alone already fill
       the whole field. */
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed. With space padding the sign is written
       later, after the padding. */
    if (arg->sign) {
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are emitted separately from the
           digits, so take them out of both the field width and the
           number of characters left to copy. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Padding is done: prevents the right-padding step below. */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with spaces if needed. This is only reached with a
       remaining width for left-justified output, because the left
       padding step above resets arg->width to len. */
    if (arg->width > len) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14660
14661/* Helper of PyUnicode_Format(): format one arg.
14662 Return 0 on success, raise an exception and return -1 on error. */
14663static int
14664unicode_format_arg(struct unicode_formatter_t *ctx)
14665{
14666 struct unicode_format_arg_t arg;
14667 PyObject *str;
14668 int ret;
14669
Victor Stinner8dbd4212012-12-04 09:30:24 +010014670 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14671 arg.flags = 0;
14672 arg.width = -1;
14673 arg.prec = -1;
14674 arg.sign = 0;
14675 str = NULL;
14676
Victor Stinnera47082312012-10-04 02:19:54 +020014677 ret = unicode_format_arg_parse(ctx, &arg);
14678 if (ret == -1)
14679 return -1;
14680
14681 ret = unicode_format_arg_format(ctx, &arg, &str);
14682 if (ret == -1)
14683 return -1;
14684
14685 if (ret != 1) {
14686 ret = unicode_format_arg_output(ctx, &arg, str);
14687 Py_DECREF(str);
14688 if (ret == -1)
14689 return -1;
14690 }
14691
14692 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14693 PyErr_SetString(PyExc_TypeError,
14694 "not all arguments converted during string formatting");
14695 return -1;
14696 }
14697 return 0;
14698}
14699
Alexander Belopolsky40018472011-02-26 01:02:56 +000014700PyObject *
14701PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014702{
Victor Stinnera47082312012-10-04 02:19:54 +020014703 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014704
Guido van Rossumd57fd912000-03-10 22:53:23 +000014705 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014706 PyErr_BadInternalCall();
14707 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014708 }
Victor Stinnera47082312012-10-04 02:19:54 +020014709
14710 ctx.fmtstr = PyUnicode_FromObject(format);
14711 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014712 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014713 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14714 Py_DECREF(ctx.fmtstr);
14715 return NULL;
14716 }
14717 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14718 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14719 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14720 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014721
Victor Stinner8f674cc2013-04-17 23:02:17 +020014722 _PyUnicodeWriter_Init(&ctx.writer);
14723 ctx.writer.min_length = ctx.fmtcnt + 100;
14724 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014725
Guido van Rossumd57fd912000-03-10 22:53:23 +000014726 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014727 ctx.arglen = PyTuple_Size(args);
14728 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014729 }
14730 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014731 ctx.arglen = -1;
14732 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014733 }
Victor Stinnera47082312012-10-04 02:19:54 +020014734 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014735 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014736 ctx.dict = args;
14737 else
14738 ctx.dict = NULL;
14739 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014740
Victor Stinnera47082312012-10-04 02:19:54 +020014741 while (--ctx.fmtcnt >= 0) {
14742 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014743 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014744
14745 nonfmtpos = ctx.fmtpos++;
14746 while (ctx.fmtcnt >= 0 &&
14747 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14748 ctx.fmtpos++;
14749 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014750 }
Victor Stinnera47082312012-10-04 02:19:54 +020014751 if (ctx.fmtcnt < 0) {
14752 ctx.fmtpos--;
14753 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014754 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014755
Victor Stinnercfc4c132013-04-03 01:48:39 +020014756 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14757 nonfmtpos, ctx.fmtpos) < 0)
14758 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014759 }
14760 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014761 ctx.fmtpos++;
14762 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014763 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014764 }
14765 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014766
Victor Stinnera47082312012-10-04 02:19:54 +020014767 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014768 PyErr_SetString(PyExc_TypeError,
14769 "not all arguments converted during string formatting");
14770 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014771 }
14772
Victor Stinnera47082312012-10-04 02:19:54 +020014773 if (ctx.args_owned) {
14774 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014775 }
Victor Stinnera47082312012-10-04 02:19:54 +020014776 Py_DECREF(ctx.fmtstr);
14777 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014778
Benjamin Peterson29060642009-01-31 22:14:21 +000014779 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014780 Py_DECREF(ctx.fmtstr);
14781 _PyUnicodeWriter_Dealloc(&ctx.writer);
14782 if (ctx.args_owned) {
14783 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014784 }
14785 return NULL;
14786}
14787
Jeremy Hylton938ace62002-07-17 16:30:39 +000014788static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014789unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14790
Tim Peters6d6c1a32001-08-02 04:15:00 +000014791static PyObject *
14792unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14793{
Benjamin Peterson29060642009-01-31 22:14:21 +000014794 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014795 static char *kwlist[] = {"object", "encoding", "errors", 0};
14796 char *encoding = NULL;
14797 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014798
Benjamin Peterson14339b62009-01-31 16:36:08 +000014799 if (type != &PyUnicode_Type)
14800 return unicode_subtype_new(type, args, kwds);
14801 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014802 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014803 return NULL;
14804 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014805 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014806 if (encoding == NULL && errors == NULL)
14807 return PyObject_Str(x);
14808 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014809 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014810}
14811
Guido van Rossume023fe02001-08-30 03:12:59 +000014812static PyObject *
14813unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14814{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014815 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014816 Py_ssize_t length, char_size;
14817 int share_wstr, share_utf8;
14818 unsigned int kind;
14819 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014820
Benjamin Peterson14339b62009-01-31 16:36:08 +000014821 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014822
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014823 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014824 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014825 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014826 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014827 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014828 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014829 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014830 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014831
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014832 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014833 if (self == NULL) {
14834 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014835 return NULL;
14836 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014837 kind = PyUnicode_KIND(unicode);
14838 length = PyUnicode_GET_LENGTH(unicode);
14839
14840 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014841#ifdef Py_DEBUG
14842 _PyUnicode_HASH(self) = -1;
14843#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014844 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014845#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014846 _PyUnicode_STATE(self).interned = 0;
14847 _PyUnicode_STATE(self).kind = kind;
14848 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014849 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014850 _PyUnicode_STATE(self).ready = 1;
14851 _PyUnicode_WSTR(self) = NULL;
14852 _PyUnicode_UTF8_LENGTH(self) = 0;
14853 _PyUnicode_UTF8(self) = NULL;
14854 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014855 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014856
14857 share_utf8 = 0;
14858 share_wstr = 0;
14859 if (kind == PyUnicode_1BYTE_KIND) {
14860 char_size = 1;
14861 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14862 share_utf8 = 1;
14863 }
14864 else if (kind == PyUnicode_2BYTE_KIND) {
14865 char_size = 2;
14866 if (sizeof(wchar_t) == 2)
14867 share_wstr = 1;
14868 }
14869 else {
14870 assert(kind == PyUnicode_4BYTE_KIND);
14871 char_size = 4;
14872 if (sizeof(wchar_t) == 4)
14873 share_wstr = 1;
14874 }
14875
14876 /* Ensure we won't overflow the length. */
14877 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14878 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014879 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014880 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014881 data = PyObject_MALLOC((length + 1) * char_size);
14882 if (data == NULL) {
14883 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014884 goto onError;
14885 }
14886
Victor Stinnerc3c74152011-10-02 20:39:55 +020014887 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014888 if (share_utf8) {
14889 _PyUnicode_UTF8_LENGTH(self) = length;
14890 _PyUnicode_UTF8(self) = data;
14891 }
14892 if (share_wstr) {
14893 _PyUnicode_WSTR_LENGTH(self) = length;
14894 _PyUnicode_WSTR(self) = (wchar_t *)data;
14895 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014896
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014897 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014898 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014899 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014900#ifdef Py_DEBUG
14901 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14902#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014903 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014904 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014905
14906onError:
14907 Py_DECREF(unicode);
14908 Py_DECREF(self);
14909 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014910}
14911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014912PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014913"str(object='') -> str\n\
14914str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014915\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014916Create a new string object from the given object. If encoding or\n\
14917errors is specified, then the object must expose a data buffer\n\
14918that will be decoded using the given encoding and error handler.\n\
14919Otherwise, returns the result of object.__str__() (if defined)\n\
14920or repr(object).\n\
14921encoding defaults to sys.getdefaultencoding().\n\
14922errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014923
/* Forward declaration: unicode_iter is referenced by the tp_iter slot
   below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of "str". The initializer is positional: each value is
   labelled with the PyTypeObject slot it fills. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash */
    0,                            /* tp_call */
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
14969
14970/* Initialize the Unicode implementation */
14971
Victor Stinner3a50e702011-10-18 21:21:00 +020014972int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014973{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014974 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014975 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014976 0x000A, /* LINE FEED */
14977 0x000D, /* CARRIAGE RETURN */
14978 0x001C, /* FILE SEPARATOR */
14979 0x001D, /* GROUP SEPARATOR */
14980 0x001E, /* RECORD SEPARATOR */
14981 0x0085, /* NEXT LINE */
14982 0x2028, /* LINE SEPARATOR */
14983 0x2029, /* PARAGRAPH SEPARATOR */
14984 };
14985
Fred Drakee4315f52000-05-09 19:53:39 +000014986 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014987 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014988 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014989 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014990 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014991
Guido van Rossumcacfc072002-05-24 19:01:59 +000014992 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014993 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014994
14995 /* initialize the linebreak bloom filter */
14996 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014997 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014998 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014999
Christian Heimes26532f72013-07-20 14:57:16 +020015000 if (PyType_Ready(&EncodingMapType) < 0)
15001 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015002
Benjamin Petersonc4311282012-10-30 23:21:10 -040015003 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15004 Py_FatalError("Can't initialize field name iterator type");
15005
15006 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15007 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015008
Victor Stinner3a50e702011-10-18 21:21:00 +020015009#ifdef HAVE_MBCS
15010 winver.dwOSVersionInfoSize = sizeof(winver);
15011 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
15012 PyErr_SetFromWindowsErr(0);
15013 return -1;
15014 }
15015#endif
15016 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015017}
15018
15019/* Finalize the Unicode implementation */
15020
/* Does nothing and always returns 0.
   NOTE(review): presumably kept for API compatibility now that no free
   list is maintained here -- confirm against the public header. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15026
Guido van Rossumd57fd912000-03-10 22:53:23 +000015027void
Thomas Wouters78890102000-07-22 19:25:51 +000015028_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015029{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015030 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015031
Serhiy Storchaka05997252013-01-26 12:14:02 +020015032 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015033
Serhiy Storchaka05997252013-01-26 12:14:02 +020015034 for (i = 0; i < 256; i++)
15035 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015036 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015037 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015038}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015039
/* Intern the string *p in place.

   If an equal string is already stored in the `interned` dict, *p is
   replaced by that string (with a new reference) and the reference to
   the original object is released. Otherwise *p itself is added to the
   dict and marked SSTATE_INTERNED_MORTAL.

   The function never reports an error: it silently leaves *p unchanged
   for str subclasses, for strings that are already interned, and when
   the dict cannot be created or updated. In non-debug builds NULL and
   non-str objects are ignored as well; debug builds assert instead. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    PyObject *s = *p;
    PyObject *t;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    if (s == NULL || !PyUnicode_Check(s))
        return;
#endif
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The interned dict is created lazily on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* An equal string is already interned: hand that one back.
           The new reference is taken before the old one is dropped. */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* The flag is raised around the insertion and lowered again on both
       the failure and the success path. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, s, s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
15091
15092void
15093PyUnicode_InternImmortal(PyObject **p)
15094{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015095 PyUnicode_InternInPlace(p);
15096 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015097 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015098 Py_INCREF(*p);
15099 }
Walter Dörwald16807132007-05-25 13:52:07 +000015100}
15101
15102PyObject *
15103PyUnicode_InternFromString(const char *cp)
15104{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015105 PyObject *s = PyUnicode_FromString(cp);
15106 if (s == NULL)
15107 return NULL;
15108 PyUnicode_InternInPlace(&s);
15109 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015110}
15111
Alexander Belopolsky40018472011-02-26 01:02:56 +000015112void
15113_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015114{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015115 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015116 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015117 Py_ssize_t i, n;
15118 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015119
Benjamin Peterson14339b62009-01-31 16:36:08 +000015120 if (interned == NULL || !PyDict_Check(interned))
15121 return;
15122 keys = PyDict_Keys(interned);
15123 if (keys == NULL || !PyList_Check(keys)) {
15124 PyErr_Clear();
15125 return;
15126 }
Walter Dörwald16807132007-05-25 13:52:07 +000015127
Benjamin Peterson14339b62009-01-31 16:36:08 +000015128 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15129 detector, interned unicode strings are not forcibly deallocated;
15130 rather, we give them their stolen references back, and then clear
15131 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015132
Benjamin Peterson14339b62009-01-31 16:36:08 +000015133 n = PyList_GET_SIZE(keys);
15134 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015135 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015136 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015137 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015138 if (PyUnicode_READY(s) == -1) {
15139 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015140 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015141 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015142 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015143 case SSTATE_NOT_INTERNED:
15144 /* XXX Shouldn't happen */
15145 break;
15146 case SSTATE_INTERNED_IMMORTAL:
15147 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015148 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015149 break;
15150 case SSTATE_INTERNED_MORTAL:
15151 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015152 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015153 break;
15154 default:
15155 Py_FatalError("Inconsistent interned string state.");
15156 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015157 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015158 }
15159 fprintf(stderr, "total size of all interned strings: "
15160 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15161 "mortal/immortal\n", mortal_size, immortal_size);
15162 Py_DECREF(keys);
15163 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015164 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015165}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015166
15167
15168/********************* Unicode Iterator **************************/
15169
15170typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015171 PyObject_HEAD
15172 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015173 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015174} unicodeiterobject;
15175
15176static void
15177unicodeiter_dealloc(unicodeiterobject *it)
15178{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015179 _PyObject_GC_UNTRACK(it);
15180 Py_XDECREF(it->it_seq);
15181 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015182}
15183
15184static int
15185unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15186{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015187 Py_VISIT(it->it_seq);
15188 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015189}
15190
15191static PyObject *
15192unicodeiter_next(unicodeiterobject *it)
15193{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015194 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015195
Benjamin Peterson14339b62009-01-31 16:36:08 +000015196 assert(it != NULL);
15197 seq = it->it_seq;
15198 if (seq == NULL)
15199 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015200 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015202 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15203 int kind = PyUnicode_KIND(seq);
15204 void *data = PyUnicode_DATA(seq);
15205 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15206 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015207 if (item != NULL)
15208 ++it->it_index;
15209 return item;
15210 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015211
Benjamin Peterson14339b62009-01-31 16:36:08 +000015212 Py_DECREF(seq);
15213 it->it_seq = NULL;
15214 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015215}
15216
15217static PyObject *
15218unicodeiter_len(unicodeiterobject *it)
15219{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015220 Py_ssize_t len = 0;
15221 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015222 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015223 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015224}
15225
15226PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15227
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015228static PyObject *
15229unicodeiter_reduce(unicodeiterobject *it)
15230{
15231 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015232 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015233 it->it_seq, it->it_index);
15234 } else {
15235 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15236 if (u == NULL)
15237 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015238 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015239 }
15240}
15241
15242PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15243
15244static PyObject *
15245unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15246{
15247 Py_ssize_t index = PyLong_AsSsize_t(state);
15248 if (index == -1 && PyErr_Occurred())
15249 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015250 if (it->it_seq != NULL) {
15251 if (index < 0)
15252 index = 0;
15253 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15254 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15255 it->it_index = index;
15256 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015257 Py_RETURN_NONE;
15258}
15259
15260PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15261
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015262static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015263 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015264 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015265 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15266 reduce_doc},
15267 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15268 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015269 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015270};
15271
15272PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015273 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15274 "str_iterator", /* tp_name */
15275 sizeof(unicodeiterobject), /* tp_basicsize */
15276 0, /* tp_itemsize */
15277 /* methods */
15278 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15279 0, /* tp_print */
15280 0, /* tp_getattr */
15281 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015282 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015283 0, /* tp_repr */
15284 0, /* tp_as_number */
15285 0, /* tp_as_sequence */
15286 0, /* tp_as_mapping */
15287 0, /* tp_hash */
15288 0, /* tp_call */
15289 0, /* tp_str */
15290 PyObject_GenericGetAttr, /* tp_getattro */
15291 0, /* tp_setattro */
15292 0, /* tp_as_buffer */
15293 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15294 0, /* tp_doc */
15295 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15296 0, /* tp_clear */
15297 0, /* tp_richcompare */
15298 0, /* tp_weaklistoffset */
15299 PyObject_SelfIter, /* tp_iter */
15300 (iternextfunc)unicodeiter_next, /* tp_iternext */
15301 unicodeiter_methods, /* tp_methods */
15302 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015303};
15304
15305static PyObject *
15306unicode_iter(PyObject *seq)
15307{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015308 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015309
Benjamin Peterson14339b62009-01-31 16:36:08 +000015310 if (!PyUnicode_Check(seq)) {
15311 PyErr_BadInternalCall();
15312 return NULL;
15313 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015314 if (PyUnicode_READY(seq) == -1)
15315 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015316 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15317 if (it == NULL)
15318 return NULL;
15319 it->it_index = 0;
15320 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015321 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015322 _PyObject_GC_TRACK(it);
15323 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015324}
15325
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015326
15327size_t
15328Py_UNICODE_strlen(const Py_UNICODE *u)
15329{
15330 int res = 0;
15331 while(*u++)
15332 res++;
15333 return res;
15334}
15335
15336Py_UNICODE*
15337Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15338{
15339 Py_UNICODE *u = s1;
15340 while ((*u++ = *s2++));
15341 return s1;
15342}
15343
15344Py_UNICODE*
15345Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15346{
15347 Py_UNICODE *u = s1;
15348 while ((*u++ = *s2++))
15349 if (n-- == 0)
15350 break;
15351 return s1;
15352}
15353
15354Py_UNICODE*
15355Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15356{
15357 Py_UNICODE *u1 = s1;
15358 u1 += Py_UNICODE_strlen(u1);
15359 Py_UNICODE_strcpy(u1, s2);
15360 return s1;
15361}
15362
15363int
15364Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15365{
15366 while (*s1 && *s2 && *s1 == *s2)
15367 s1++, s2++;
15368 if (*s1 && *s2)
15369 return (*s1 < *s2) ? -1 : +1;
15370 if (*s1)
15371 return 1;
15372 if (*s2)
15373 return -1;
15374 return 0;
15375}
15376
15377int
15378Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15379{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015380 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015381 for (; n != 0; n--) {
15382 u1 = *s1;
15383 u2 = *s2;
15384 if (u1 != u2)
15385 return (u1 < u2) ? -1 : +1;
15386 if (u1 == '\0')
15387 return 0;
15388 s1++;
15389 s2++;
15390 }
15391 return 0;
15392}
15393
15394Py_UNICODE*
15395Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15396{
15397 const Py_UNICODE *p;
15398 for (p = s; *p; p++)
15399 if (*p == c)
15400 return (Py_UNICODE*)p;
15401 return NULL;
15402}
15403
15404Py_UNICODE*
15405Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15406{
15407 const Py_UNICODE *p;
15408 p = s + Py_UNICODE_strlen(s);
15409 while (p != s) {
15410 p--;
15411 if (*p == c)
15412 return (Py_UNICODE*)p;
15413 }
15414 return NULL;
15415}
Victor Stinner331ea922010-08-10 16:37:20 +000015416
Victor Stinner71133ff2010-09-01 23:43:53 +000015417Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015418PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015419{
Victor Stinner577db2c2011-10-11 22:12:48 +020015420 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015421 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015423 if (!PyUnicode_Check(unicode)) {
15424 PyErr_BadArgument();
15425 return NULL;
15426 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015427 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015428 if (u == NULL)
15429 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015430 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015431 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015432 PyErr_NoMemory();
15433 return NULL;
15434 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015435 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015436 size *= sizeof(Py_UNICODE);
15437 copy = PyMem_Malloc(size);
15438 if (copy == NULL) {
15439 PyErr_NoMemory();
15440 return NULL;
15441 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015442 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015443 return copy;
15444}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015445
Georg Brandl66c221e2010-10-14 07:04:07 +000015446/* A _string module, to export formatter_parser and formatter_field_name_split
15447 to the string.Formatter class implemented in Python. */
15448
15449static PyMethodDef _string_methods[] = {
15450 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15451 METH_O, PyDoc_STR("split the argument as a field name")},
15452 {"formatter_parser", (PyCFunction) formatter_parser,
15453 METH_O, PyDoc_STR("parse the argument as a format string")},
15454 {NULL, NULL}
15455};
15456
15457static struct PyModuleDef _string_module = {
15458 PyModuleDef_HEAD_INIT,
15459 "_string",
15460 PyDoc_STR("string helper module"),
15461 0,
15462 _string_methods,
15463 NULL,
15464 NULL,
15465 NULL,
15466 NULL
15467};
15468
15469PyMODINIT_FUNC
15470PyInit__string(void)
15471{
15472 return PyModule_Create(&_string_module);
15473}
15474
15475
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015476#ifdef __cplusplus
15477}
15478#endif