blob: 84ab6a114cd3eefeef2b112cc36c880df900c315 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Larry Hastings61272b72014-01-07 12:41:53 -080050/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080051class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080052[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080053/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080054
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055/* --- Globals ------------------------------------------------------------
56
Serhiy Storchaka05997252013-01-26 12:14:02 +020057NOTE: In the interpreter's initialization phase, some globals are currently
58 initialized dynamically as needed. In the process Unicode objects may
59 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000060
61*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063
64#ifdef __cplusplus
65extern "C" {
66#endif
67
Victor Stinner8faf8212011-12-08 22:14:11 +010068/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
69#define MAX_UNICODE 0x10ffff
70
Victor Stinner910337b2011-10-03 03:20:16 +020071#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020072# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020073#else
74# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
75#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020076
/* Raw field accessors for the PEP 393 string structures.  The macros with a
   leading underscore are plain casts plus a member access and may be used as
   lvalues; they perform no checking unless noted below. */

/* Pointer stored in the utf8 member of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked variant: asserts that op is a ready unicode object.  For a compact
   ASCII string it yields the address just past the PyASCIIObject header,
   otherwise the stored utf8 pointer. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* utf8_length member of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked variant: for a compact ASCII string it yields the length member of
   the PyASCIIObject, otherwise the stored utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* wstr member of the PyASCIIObject header. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* wstr_length member of a PyCompactUnicodeObject. */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* length, state and hash members of the PyASCIIObject header. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind, after asserting _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* length, after asserting _PyUnicode_CHECK(op). */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* data.any member of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200111
/* Replace any earlier definition of PyUnicode_READY for the rest of this
   file.  This version asserts _PyUnicode_CHECK(op), evaluates to 0 when the
   string is already ready, and otherwise to the result of
   _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
Victor Stinnerc379ead2011-10-03 12:52:27 +0200119#define _PyUnicode_SHARE_UTF8(op) \
120 (assert(_PyUnicode_CHECK(op)), \
121 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
122 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op aliases the string's character data
   (i.e. no separate wchar_t buffer is held).  Asserts _PyUnicode_CHECK(op).
   Every reference uses the macro parameter, so the macro works regardless
   of how the caller names its variable. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
Victor Stinner829c0ad2011-10-03 01:08:02 +0200127/* true if the Unicode object has an allocated UTF-8 memory block
128 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200129#define _PyUnicode_HAS_UTF8_MEMORY(op) \
Victor Stinnere699e5a2013-07-15 18:22:47 +0200130 ((!PyUnicode_IS_COMPACT_ASCII(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200131 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200132 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
133
Victor Stinner03490912011-10-03 23:45:12 +0200134/* true if the Unicode object has an allocated wstr memory block
135 (not shared with other data) */
136#define _PyUnicode_HAS_WSTR_MEMORY(op) \
Victor Stinnere699e5a2013-07-15 18:22:47 +0200137 ((_PyUnicode_WSTR(op) && \
Victor Stinner03490912011-10-03 23:45:12 +0200138 (!PyUnicode_IS_READY(op) || \
139 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
140
/* Generic helper macro to convert characters between code unit types.

   from_type and to_type must be valid type names.  begin and end delimit the
   source characters and are treated as "const from_type *"; to points to the
   destination buffer and is treated as "to_type *".  Each source element is
   converted with a plain cast to to_type; the destination must have room for
   (end - begin) elements.

   The casts keep the const qualifier of the source pointers, and all locals
   use a leading underscore so they cannot shadow a caller's variable.  The
   main loop copies four elements per iteration; the second loop handles the
   remaining 0-3 elements. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end); \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
Walter Dörwald16807132007-05-25 13:52:07 +0000165/* This dictionary holds all interned unicode strings. Note that references
166 to strings in this dictionary are *not* counted in the string's ob_refcnt.
167 When the interned string reaches a refcnt of 0 the string deallocation
168 function will delete the reference from this dictionary.
169
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000172*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200173static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000174
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200176static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200177
/* Take a new reference to the shared empty string.

   If unicode_empty has not been created yet, it is created here with
   PyUnicode_New(0, 0) and then incremented once more for the caller.  If
   that creation fails, unicode_empty stays NULL and no reference is taken,
   so code using this macro must check unicode_empty for NULL afterwards. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190
/* Return the shared empty string from the enclosing function after taking a
   reference with _Py_INCREF_UNICODE_EMPTY().  The returned pointer is NULL
   if the empty string could not be created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200197/* Forward declaration */
198Py_LOCAL_INLINE(int)
199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
200
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
Christian Heimes190d79e2008-01-30 11:58:22 +0000208/* Fast detection of the most frequent whitespace characters */
/* 128-entry lookup table indexed by ASCII code: 1 for whitespace, else 0. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100242static int unicode_modifiable(PyObject *unicode);
243
Victor Stinnerfe226c02011-10-03 03:52:20 +0200244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200247static PyObject *
248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
249static PyObject *
250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
251
252static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100255 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
257
Alexander Belopolsky40018472011-02-26 01:02:56 +0000258static void
259raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300260 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100261 PyObject *unicode,
262 Py_ssize_t startpos, Py_ssize_t endpos,
263 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000264
Christian Heimes190d79e2008-01-30 11:58:22 +0000265/* Same for linebreaks */
/* 128-entry lookup table indexed by ASCII code: 1 for line break characters,
   else 0.  The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
292
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300293/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
294 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000296PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000298#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000299 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000300#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 /* This is actually an illegal character, so it should
302 not be passed to unichr. */
303 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000304#endif
305}
306
Victor Stinner910337b2011-10-03 03:20:16 +0200307#ifdef Py_DEBUG
/* Debug-build invariant checker for a unicode object.

   Verifies with assert() that the state bits (kind, compact, ascii, ready),
   the data/utf8/wstr pointers and the cached lengths of op agree with each
   other.  If check_content is non-zero and the string is not in the legacy
   wchar_t representation, it also scans the characters to verify that the
   narrowest possible kind is used and that the buffer is null-terminated.

   Always returns 1, so the call itself can be wrapped in assert(); any
   violation aborts through one of the inner assertions. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Compact ASCII: must be a ready 1-byte string; nothing else to check. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* Compact non-ASCII: the characters start right after the
               PyCompactUnicodeObject header and utf8 must not alias them. */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not-ready string: only the wstr buffer is populated. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* Ready, non-compact string: data lives in a separate
                   buffer; utf8 aliases it exactly when the string is ASCII. */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* When the kind has the same width as wchar_t, wstr is expected
               to alias the character data; otherwise it must not. */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* A missing cache buffer must have a zero cached length. */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        /* Find the largest code point actually stored in the string. */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* The maximum must fall in the range that requires this kind. */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* The element just past the last character must be a null. */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
Victor Stinner910337b2011-10-03 03:20:16 +0200421#endif
422
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100423static PyObject*
424unicode_result_wchar(PyObject *unicode)
425{
426#ifndef Py_DEBUG
427 Py_ssize_t len;
428
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100429 len = _PyUnicode_WSTR_LENGTH(unicode);
430 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200432 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 }
434
435 if (len == 1) {
436 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100437 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100438 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
439 Py_DECREF(unicode);
440 return latin1_char;
441 }
442 }
443
444 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200445 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100446 return NULL;
447 }
448#else
Victor Stinneraa771272012-10-04 02:32:58 +0200449 assert(Py_REFCNT(unicode) == 1);
450
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100451 /* don't make the result ready in debug mode to ensure that the caller
452 makes the string ready before using it */
453 assert(_PyUnicode_CheckConsistency(unicode, 1));
454#endif
455 return unicode;
456}
457
458static PyObject*
459unicode_result_ready(PyObject *unicode)
460{
461 Py_ssize_t length;
462
463 length = PyUnicode_GET_LENGTH(unicode);
464 if (length == 0) {
465 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100466 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200467 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 }
469 return unicode_empty;
470 }
471
472 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200473 void *data = PyUnicode_DATA(unicode);
474 int kind = PyUnicode_KIND(unicode);
475 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100476 if (ch < 256) {
477 PyObject *latin1_char = unicode_latin1[ch];
478 if (latin1_char != NULL) {
479 if (unicode != latin1_char) {
480 Py_INCREF(latin1_char);
481 Py_DECREF(unicode);
482 }
483 return latin1_char;
484 }
485 else {
486 assert(_PyUnicode_CheckConsistency(unicode, 1));
487 Py_INCREF(unicode);
488 unicode_latin1[ch] = unicode;
489 return unicode;
490 }
491 }
492 }
493
494 assert(_PyUnicode_CheckConsistency(unicode, 1));
495 return unicode;
496}
497
498static PyObject*
499unicode_result(PyObject *unicode)
500{
501 assert(_PyUnicode_CHECK(unicode));
502 if (PyUnicode_IS_READY(unicode))
503 return unicode_result_ready(unicode);
504 else
505 return unicode_result_wchar(unicode);
506}
507
Victor Stinnerc4b49542011-12-11 22:44:26 +0100508static PyObject*
509unicode_result_unchanged(PyObject *unicode)
510{
511 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500512 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100513 return NULL;
514 Py_INCREF(unicode);
515 return unicode;
516 }
517 else
518 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100519 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100520}
521
Victor Stinner3a50e702011-10-18 21:21:00 +0200522#ifdef HAVE_MBCS
523static OSVERSIONINFOEX winver;
524#endif
525
Thomas Wouters477c8d52006-05-27 19:21:47 +0000526/* --- Bloom Filters ----------------------------------------------------- */
527
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits of
   each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
531
532/* the linebreak mask is set up by Unicode_Init below */
533
Antoine Pitrouf068f942010-01-13 14:19:12 +0000534#if LONG_BIT >= 128
535#define BLOOM_WIDTH 128
536#elif LONG_BIT >= 64
537#define BLOOM_WIDTH 64
538#elif LONG_BIT >= 32
539#define BLOOM_WIDTH 32
540#else
541#error "LONG_BIT is smaller than 32"
542#endif
543
Thomas Wouters477c8d52006-05-27 19:21:47 +0000544#define BLOOM_MASK unsigned long
545
Serhiy Storchaka05997252013-01-26 12:14:02 +0200546static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547
/* Non-zero if the bit selected by the low bits of ch is set in mask.
   Both arguments are parenthesized so that expressions such as
   BLOOM(a | b, ch) are evaluated as intended. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000549
Benjamin Peterson29060642009-01-31 22:14:21 +0000550#define BLOOM_LINEBREAK(ch) \
551 ((ch) < 128U ? ascii_linebreak[(ch)] : \
552 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000553
Alexander Belopolsky40018472011-02-26 01:02:56 +0000554Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000556{
Victor Stinnera85af502013-04-09 21:53:54 +0200557#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
558 do { \
559 TYPE *data = (TYPE *)PTR; \
560 TYPE *end = data + LEN; \
561 Py_UCS4 ch; \
562 for (; data != end; data++) { \
563 ch = *data; \
564 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
565 } \
566 break; \
567 } while (0)
568
Thomas Wouters477c8d52006-05-27 19:21:47 +0000569 /* calculate simple bloom-style bitmask for a given unicode string */
570
Antoine Pitrouf068f942010-01-13 14:19:12 +0000571 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000572
573 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200574 switch (kind) {
575 case PyUnicode_1BYTE_KIND:
576 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
577 break;
578 case PyUnicode_2BYTE_KIND:
579 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
580 break;
581 case PyUnicode_4BYTE_KIND:
582 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
583 break;
584 default:
585 assert(0);
586 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000587 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200588
589#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000590}
591
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200592/* Compilation of templated routines */
593
594#include "stringlib/asciilib.h"
595#include "stringlib/fastsearch.h"
596#include "stringlib/partition.h"
597#include "stringlib/split.h"
598#include "stringlib/count.h"
599#include "stringlib/find.h"
600#include "stringlib/find_max_char.h"
601#include "stringlib/localeutil.h"
602#include "stringlib/undef.h"
603
604#include "stringlib/ucs1lib.h"
605#include "stringlib/fastsearch.h"
606#include "stringlib/partition.h"
607#include "stringlib/split.h"
608#include "stringlib/count.h"
609#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300610#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200611#include "stringlib/find_max_char.h"
612#include "stringlib/localeutil.h"
613#include "stringlib/undef.h"
614
615#include "stringlib/ucs2lib.h"
616#include "stringlib/fastsearch.h"
617#include "stringlib/partition.h"
618#include "stringlib/split.h"
619#include "stringlib/count.h"
620#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300621#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200622#include "stringlib/find_max_char.h"
623#include "stringlib/localeutil.h"
624#include "stringlib/undef.h"
625
626#include "stringlib/ucs4lib.h"
627#include "stringlib/fastsearch.h"
628#include "stringlib/partition.h"
629#include "stringlib/split.h"
630#include "stringlib/count.h"
631#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300632#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200633#include "stringlib/find_max_char.h"
634#include "stringlib/localeutil.h"
635#include "stringlib/undef.h"
636
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200637#include "stringlib/unicodedefs.h"
638#include "stringlib/fastsearch.h"
639#include "stringlib/count.h"
640#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100641#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643/* --- Unicode Object ----------------------------------------------------- */
644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200646fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200648Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
649 Py_ssize_t size, Py_UCS4 ch,
650 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200651{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200652 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
653
654 switch (kind) {
655 case PyUnicode_1BYTE_KIND:
656 {
657 Py_UCS1 ch1 = (Py_UCS1) ch;
658 if (ch1 == ch)
659 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
660 else
661 return -1;
662 }
663 case PyUnicode_2BYTE_KIND:
664 {
665 Py_UCS2 ch2 = (Py_UCS2) ch;
666 if (ch2 == ch)
667 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
668 else
669 return -1;
670 }
671 case PyUnicode_4BYTE_KIND:
672 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
673 default:
674 assert(0);
675 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200677}
678
#ifdef Py_DEBUG
/* Overwrite the characters of a Unicode string located between `old_length`
   and its current length with 0xFF bytes, so that bugs are detected earlier.
   Nothing is written when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_width = PyUnicode_KIND(unicode);
    Py_UCS1 *chars = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        Py_ssize_t added = new_length - old_length;
        /* Offsets are in bytes: the kind value is used as the width of one
           character. */
        memset(chars + old_length * char_width, 0xff, added * char_width);
    }
}
#endif
697
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698static PyObject*
699resize_compact(PyObject *unicode, Py_ssize_t length)
700{
701 Py_ssize_t char_size;
702 Py_ssize_t struct_size;
703 Py_ssize_t new_size;
704 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100705 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200706#ifdef Py_DEBUG
707 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
708#endif
709
Victor Stinner79891572012-05-03 13:43:07 +0200710 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200711 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100712 assert(PyUnicode_IS_COMPACT(unicode));
713
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200714 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100715 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 struct_size = sizeof(PyASCIIObject);
717 else
718 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200719 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200720
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
722 PyErr_NoMemory();
723 return NULL;
724 }
725 new_size = (struct_size + (length + 1) * char_size);
726
Victor Stinner84def372011-12-11 20:04:56 +0100727 _Py_DEC_REFTOTAL;
728 _Py_ForgetReference(unicode);
729
730 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
731 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100732 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200733 PyErr_NoMemory();
734 return NULL;
735 }
Victor Stinner84def372011-12-11 20:04:56 +0100736 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100738
Victor Stinnerfe226c02011-10-03 03:52:20 +0200739 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200740 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100742 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200743 _PyUnicode_WSTR_LENGTH(unicode) = length;
744 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100745 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
746 PyObject_DEL(_PyUnicode_WSTR(unicode));
747 _PyUnicode_WSTR(unicode) = NULL;
748 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200749#ifdef Py_DEBUG
750 unicode_fill_invalid(unicode, old_length);
751#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200752 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
753 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200754 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200755 return unicode;
756}
757
Alexander Belopolsky40018472011-02-26 01:02:56 +0000758static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200759resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000760{
Victor Stinner95663112011-10-04 01:03:50 +0200761 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100762 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200763 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000765
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 if (PyUnicode_IS_READY(unicode)) {
767 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200768 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200770#ifdef Py_DEBUG
771 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
772#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773
774 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200775 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200776 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
777 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200778
779 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
780 PyErr_NoMemory();
781 return -1;
782 }
783 new_size = (length + 1) * char_size;
784
Victor Stinner7a9105a2011-12-12 00:13:42 +0100785 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
786 {
787 PyObject_DEL(_PyUnicode_UTF8(unicode));
788 _PyUnicode_UTF8(unicode) = NULL;
789 _PyUnicode_UTF8_LENGTH(unicode) = 0;
790 }
791
Victor Stinnerfe226c02011-10-03 03:52:20 +0200792 data = (PyObject *)PyObject_REALLOC(data, new_size);
793 if (data == NULL) {
794 PyErr_NoMemory();
795 return -1;
796 }
797 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200798 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200799 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200800 _PyUnicode_WSTR_LENGTH(unicode) = length;
801 }
802 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200803 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200804 _PyUnicode_UTF8_LENGTH(unicode) = length;
805 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200806 _PyUnicode_LENGTH(unicode) = length;
807 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200808#ifdef Py_DEBUG
809 unicode_fill_invalid(unicode, old_length);
810#endif
Victor Stinner95663112011-10-04 01:03:50 +0200811 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200812 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200814 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200815 }
Victor Stinner95663112011-10-04 01:03:50 +0200816 assert(_PyUnicode_WSTR(unicode) != NULL);
817
818 /* check for integer overflow */
819 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
820 PyErr_NoMemory();
821 return -1;
822 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100823 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200824 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100825 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200826 if (!wstr) {
827 PyErr_NoMemory();
828 return -1;
829 }
830 _PyUnicode_WSTR(unicode) = wstr;
831 _PyUnicode_WSTR(unicode)[length] = 0;
832 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200833 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834 return 0;
835}
836
Victor Stinnerfe226c02011-10-03 03:52:20 +0200837static PyObject*
838resize_copy(PyObject *unicode, Py_ssize_t length)
839{
840 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100841 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200842 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100843
Benjamin Petersonbac79492012-01-14 13:34:47 -0500844 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100845 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200846
847 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
848 if (copy == NULL)
849 return NULL;
850
851 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200852 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200853 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200854 }
855 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200856 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100857
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200858 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200859 if (w == NULL)
860 return NULL;
861 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
862 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200863 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
864 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200865 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200866 }
867}
868
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
877
Alexander Belopolsky40018472011-02-26 01:02:56 +0000878static PyUnicodeObject *
879_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200881 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200882 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883
Thomas Wouters477c8d52006-05-27 19:21:47 +0000884 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885 if (length == 0 && unicode_empty != NULL) {
886 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200887 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000888 }
889
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000890 /* Ensure we won't overflow the size. */
891 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
892 return (PyUnicodeObject *)PyErr_NoMemory();
893 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200894 if (length < 0) {
895 PyErr_SetString(PyExc_SystemError,
896 "Negative size passed to _PyUnicode_New");
897 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000898 }
899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200900 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
901 if (unicode == NULL)
902 return NULL;
903 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100904
905 _PyUnicode_WSTR_LENGTH(unicode) = length;
906 _PyUnicode_HASH(unicode) = -1;
907 _PyUnicode_STATE(unicode).interned = 0;
908 _PyUnicode_STATE(unicode).kind = 0;
909 _PyUnicode_STATE(unicode).compact = 0;
910 _PyUnicode_STATE(unicode).ready = 0;
911 _PyUnicode_STATE(unicode).ascii = 0;
912 _PyUnicode_DATA_ANY(unicode) = NULL;
913 _PyUnicode_LENGTH(unicode) = 0;
914 _PyUnicode_UTF8(unicode) = NULL;
915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
918 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100919 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000920 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100921 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000922 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200923
Jeremy Hyltond8082792003-09-16 19:41:39 +0000924 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000925 * the caller fails before initializing str -- unicode_resize()
926 * reads str[0], and the Keep-Alive optimization can keep memory
927 * allocated for str alive across a call to unicode_dealloc(unicode).
928 * We don't want unicode_resize to read uninitialized memory in
929 * that case.
930 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200931 _PyUnicode_WSTR(unicode)[0] = 0;
932 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100933
Victor Stinner7931d9a2011-11-04 00:22:48 +0100934 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000935 return unicode;
936}
937
Victor Stinnerf42dc442011-10-02 23:33:16 +0200938static const char*
939unicode_kind_name(PyObject *unicode)
940{
Victor Stinner42dfd712011-10-03 14:41:45 +0200941 /* don't check consistency: unicode_kind_name() is called from
942 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943 if (!PyUnicode_IS_COMPACT(unicode))
944 {
945 if (!PyUnicode_IS_READY(unicode))
946 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600947 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200948 {
949 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200951 return "legacy ascii";
952 else
953 return "legacy latin1";
954 case PyUnicode_2BYTE_KIND:
955 return "legacy UCS2";
956 case PyUnicode_4BYTE_KIND:
957 return "legacy UCS4";
958 default:
959 return "<legacy invalid kind>";
960 }
961 }
962 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600963 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200964 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200965 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200966 return "ascii";
967 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200968 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200970 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200971 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200972 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200973 default:
974 return "<invalid compact kind>";
975 }
976}
977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200978#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
983
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
987void *_PyUnicode_data(void *unicode){
988 printf("obj %p\n", unicode);
989 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
990 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
991 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
992 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
993 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
994 return PyUnicode_DATA(unicode);
995}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200996
997void
998_PyUnicode_Dump(PyObject *op)
999{
1000 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001001 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1002 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1003 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001004
Victor Stinnera849a4b2011-10-03 12:12:11 +02001005 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001006 {
1007 if (ascii->state.ascii)
1008 data = (ascii + 1);
1009 else
1010 data = (compact + 1);
1011 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001012 else
1013 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001014 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1015 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001016
Victor Stinnera849a4b2011-10-03 12:12:11 +02001017 if (ascii->wstr == data)
1018 printf("shared ");
1019 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001020
Victor Stinnera3b334d2011-10-03 13:53:37 +02001021 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001022 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001023 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1024 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001025 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1026 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001027 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001028 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001029}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001030#endif
1031
1032PyObject *
1033PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1034{
1035 PyObject *obj;
1036 PyCompactUnicodeObject *unicode;
1037 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001038 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001039 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001040 Py_ssize_t char_size;
1041 Py_ssize_t struct_size;
1042
1043 /* Optimization for empty strings */
1044 if (size == 0 && unicode_empty != NULL) {
1045 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001046 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047 }
1048
Victor Stinner9e9d6892011-10-04 01:02:02 +02001049 is_ascii = 0;
1050 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051 struct_size = sizeof(PyCompactUnicodeObject);
1052 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001053 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054 char_size = 1;
1055 is_ascii = 1;
1056 struct_size = sizeof(PyASCIIObject);
1057 }
1058 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001059 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 char_size = 1;
1061 }
1062 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001063 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 char_size = 2;
1065 if (sizeof(wchar_t) == 2)
1066 is_sharing = 1;
1067 }
1068 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001069 if (maxchar > MAX_UNICODE) {
1070 PyErr_SetString(PyExc_SystemError,
1071 "invalid maximum character passed to PyUnicode_New");
1072 return NULL;
1073 }
Victor Stinner8f825062012-04-27 13:55:39 +02001074 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001075 char_size = 4;
1076 if (sizeof(wchar_t) == 4)
1077 is_sharing = 1;
1078 }
1079
1080 /* Ensure we won't overflow the size. */
1081 if (size < 0) {
1082 PyErr_SetString(PyExc_SystemError,
1083 "Negative size passed to PyUnicode_New");
1084 return NULL;
1085 }
1086 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1087 return PyErr_NoMemory();
1088
1089 /* Duplicated allocation code from _PyObject_New() instead of a call to
1090 * PyObject_New() so we are able to allocate space for the object and
1091 * it's data buffer.
1092 */
1093 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1094 if (obj == NULL)
1095 return PyErr_NoMemory();
1096 obj = PyObject_INIT(obj, &PyUnicode_Type);
1097 if (obj == NULL)
1098 return NULL;
1099
1100 unicode = (PyCompactUnicodeObject *)obj;
1101 if (is_ascii)
1102 data = ((PyASCIIObject*)obj) + 1;
1103 else
1104 data = unicode + 1;
1105 _PyUnicode_LENGTH(unicode) = size;
1106 _PyUnicode_HASH(unicode) = -1;
1107 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001108 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001109 _PyUnicode_STATE(unicode).compact = 1;
1110 _PyUnicode_STATE(unicode).ready = 1;
1111 _PyUnicode_STATE(unicode).ascii = is_ascii;
1112 if (is_ascii) {
1113 ((char*)data)[size] = 0;
1114 _PyUnicode_WSTR(unicode) = NULL;
1115 }
Victor Stinner8f825062012-04-27 13:55:39 +02001116 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 ((char*)data)[size] = 0;
1118 _PyUnicode_WSTR(unicode) = NULL;
1119 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001121 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001122 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 else {
1124 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001125 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001126 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001128 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129 ((Py_UCS4*)data)[size] = 0;
1130 if (is_sharing) {
1131 _PyUnicode_WSTR_LENGTH(unicode) = size;
1132 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1133 }
1134 else {
1135 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1136 _PyUnicode_WSTR(unicode) = NULL;
1137 }
1138 }
Victor Stinner8f825062012-04-27 13:55:39 +02001139#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001140 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001141#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001142 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001143 return obj;
1144}
1145
1146#if SIZEOF_WCHAR_T == 2
1147/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1148 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001149 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001150
1151 This function assumes that unicode can hold one more code point than wstr
1152 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001153static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001154unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001155 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156{
1157 const wchar_t *iter;
1158 Py_UCS4 *ucs4_out;
1159
Victor Stinner910337b2011-10-03 03:20:16 +02001160 assert(unicode != NULL);
1161 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1163 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1164
1165 for (iter = begin; iter < end; ) {
1166 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1167 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001168 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1169 && (iter+1) < end
1170 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171 {
Victor Stinner551ac952011-11-29 22:58:13 +01001172 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173 iter += 2;
1174 }
1175 else {
1176 *ucs4_out++ = *iter;
1177 iter++;
1178 }
1179 }
1180 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1181 _PyUnicode_GET_LENGTH(unicode)));
1182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001183}
1184#endif
1185
Victor Stinnercd9950f2011-10-02 00:34:53 +02001186static int
Victor Stinner488fa492011-12-12 00:01:39 +01001187unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001188{
Victor Stinner488fa492011-12-12 00:01:39 +01001189 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001190 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001191 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001192 return -1;
1193 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001194 return 0;
1195}
1196
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001197static int
1198_copy_characters(PyObject *to, Py_ssize_t to_start,
1199 PyObject *from, Py_ssize_t from_start,
1200 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001201{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001202 unsigned int from_kind, to_kind;
1203 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204
Victor Stinneree4544c2012-05-09 22:24:08 +02001205 assert(0 <= how_many);
1206 assert(0 <= from_start);
1207 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001208 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001209 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001210 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001211
Victor Stinnerd3f08822012-05-29 12:57:52 +02001212 assert(PyUnicode_Check(to));
1213 assert(PyUnicode_IS_READY(to));
1214 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1215
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001216 if (how_many == 0)
1217 return 0;
1218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001221 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001222 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001223
Victor Stinnerf1852262012-06-16 16:38:26 +02001224#ifdef Py_DEBUG
1225 if (!check_maxchar
1226 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1227 {
1228 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1229 Py_UCS4 ch;
1230 Py_ssize_t i;
1231 for (i=0; i < how_many; i++) {
1232 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1233 assert(ch <= to_maxchar);
1234 }
1235 }
1236#endif
1237
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001238 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001239 if (check_maxchar
1240 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1241 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001242 /* Writing Latin-1 characters into an ASCII string requires to
1243 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001244 Py_UCS4 max_char;
1245 max_char = ucs1lib_find_max_char(from_data,
1246 (Py_UCS1*)from_data + how_many);
1247 if (max_char >= 128)
1248 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001249 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001250 Py_MEMCPY((char*)to_data + to_kind * to_start,
1251 (char*)from_data + from_kind * from_start,
1252 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001253 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001254 else if (from_kind == PyUnicode_1BYTE_KIND
1255 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001256 {
1257 _PyUnicode_CONVERT_BYTES(
1258 Py_UCS1, Py_UCS2,
1259 PyUnicode_1BYTE_DATA(from) + from_start,
1260 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1261 PyUnicode_2BYTE_DATA(to) + to_start
1262 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001263 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001264 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001265 && to_kind == PyUnicode_4BYTE_KIND)
1266 {
1267 _PyUnicode_CONVERT_BYTES(
1268 Py_UCS1, Py_UCS4,
1269 PyUnicode_1BYTE_DATA(from) + from_start,
1270 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1271 PyUnicode_4BYTE_DATA(to) + to_start
1272 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001273 }
1274 else if (from_kind == PyUnicode_2BYTE_KIND
1275 && to_kind == PyUnicode_4BYTE_KIND)
1276 {
1277 _PyUnicode_CONVERT_BYTES(
1278 Py_UCS2, Py_UCS4,
1279 PyUnicode_2BYTE_DATA(from) + from_start,
1280 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1281 PyUnicode_4BYTE_DATA(to) + to_start
1282 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001283 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001284 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001285 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1286
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001287 if (!check_maxchar) {
1288 if (from_kind == PyUnicode_2BYTE_KIND
1289 && to_kind == PyUnicode_1BYTE_KIND)
1290 {
1291 _PyUnicode_CONVERT_BYTES(
1292 Py_UCS2, Py_UCS1,
1293 PyUnicode_2BYTE_DATA(from) + from_start,
1294 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1295 PyUnicode_1BYTE_DATA(to) + to_start
1296 );
1297 }
1298 else if (from_kind == PyUnicode_4BYTE_KIND
1299 && to_kind == PyUnicode_1BYTE_KIND)
1300 {
1301 _PyUnicode_CONVERT_BYTES(
1302 Py_UCS4, Py_UCS1,
1303 PyUnicode_4BYTE_DATA(from) + from_start,
1304 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1305 PyUnicode_1BYTE_DATA(to) + to_start
1306 );
1307 }
1308 else if (from_kind == PyUnicode_4BYTE_KIND
1309 && to_kind == PyUnicode_2BYTE_KIND)
1310 {
1311 _PyUnicode_CONVERT_BYTES(
1312 Py_UCS4, Py_UCS2,
1313 PyUnicode_4BYTE_DATA(from) + from_start,
1314 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1315 PyUnicode_2BYTE_DATA(to) + to_start
1316 );
1317 }
1318 else {
1319 assert(0);
1320 return -1;
1321 }
1322 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001323 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001324 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001325 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001326 Py_ssize_t i;
1327
Victor Stinnera0702ab2011-09-29 14:14:38 +02001328 for (i=0; i < how_many; i++) {
1329 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001330 if (ch > to_maxchar)
1331 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001332 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1333 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001334 }
1335 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001336 return 0;
1337}
1338
Victor Stinnerd3f08822012-05-29 12:57:52 +02001339void
1340_PyUnicode_FastCopyCharacters(
1341 PyObject *to, Py_ssize_t to_start,
1342 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001343{
1344 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1345}
1346
1347Py_ssize_t
1348PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1349 PyObject *from, Py_ssize_t from_start,
1350 Py_ssize_t how_many)
1351{
1352 int err;
1353
1354 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1355 PyErr_BadInternalCall();
1356 return -1;
1357 }
1358
Benjamin Petersonbac79492012-01-14 13:34:47 -05001359 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001360 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001361 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001362 return -1;
1363
Victor Stinnerd3f08822012-05-29 12:57:52 +02001364 if (from_start < 0) {
1365 PyErr_SetString(PyExc_IndexError, "string index out of range");
1366 return -1;
1367 }
1368 if (to_start < 0) {
1369 PyErr_SetString(PyExc_IndexError, "string index out of range");
1370 return -1;
1371 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001372 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1373 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1374 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001375 "Cannot write %zi characters at %zi "
1376 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001377 how_many, to_start, PyUnicode_GET_LENGTH(to));
1378 return -1;
1379 }
1380
1381 if (how_many == 0)
1382 return 0;
1383
Victor Stinner488fa492011-12-12 00:01:39 +01001384 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001385 return -1;
1386
1387 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1388 if (err) {
1389 PyErr_Format(PyExc_SystemError,
1390 "Cannot copy %s characters "
1391 "into a string of %s characters",
1392 unicode_kind_name(from),
1393 unicode_kind_name(to));
1394 return -1;
1395 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001396 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001397}
1398
Victor Stinner17222162011-09-28 22:15:37 +02001399/* Find the maximum code point and count the number of surrogate pairs so a
1400 correct string length can be computed before converting a string to UCS4.
1401 This function counts single surrogates as a character and not as a pair.
1402
1403 Return 0 on success, or -1 on error. */
1404static int
1405find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1406 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407{
1408 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001409 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410
Victor Stinnerc53be962011-10-02 21:33:54 +02001411 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412 *num_surrogates = 0;
1413 *maxchar = 0;
1414
1415 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001417 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1418 && (iter+1) < end
1419 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1420 {
1421 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1422 ++(*num_surrogates);
1423 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 }
1425 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001427 {
1428 ch = *iter;
1429 iter++;
1430 }
1431 if (ch > *maxchar) {
1432 *maxchar = ch;
1433 if (*maxchar > MAX_UNICODE) {
1434 PyErr_Format(PyExc_ValueError,
1435 "character U+%x is not in range [U+0000; U+10ffff]",
1436 ch);
1437 return -1;
1438 }
1439 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 }
1441 return 0;
1442}
1443
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001444int
1445_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001446{
1447 wchar_t *end;
1448 Py_UCS4 maxchar = 0;
1449 Py_ssize_t num_surrogates;
1450#if SIZEOF_WCHAR_T == 2
1451 Py_ssize_t length_wo_surrogates;
1452#endif
1453
Georg Brandl7597add2011-10-05 16:36:47 +02001454 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001455 strings were created using _PyObject_New() and where no canonical
1456 representation (the str field) has been set yet aka strings
1457 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001458 assert(_PyUnicode_CHECK(unicode));
1459 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001461 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001462 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001463 /* Actually, it should neither be interned nor be anything else: */
1464 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001467 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001468 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001469 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470
1471 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001472 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1473 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 PyErr_NoMemory();
1475 return -1;
1476 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001477 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 _PyUnicode_WSTR(unicode), end,
1479 PyUnicode_1BYTE_DATA(unicode));
1480 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1481 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1482 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1483 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001484 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001485 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001486 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 }
1488 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001489 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001490 _PyUnicode_UTF8(unicode) = NULL;
1491 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 }
1493 PyObject_FREE(_PyUnicode_WSTR(unicode));
1494 _PyUnicode_WSTR(unicode) = NULL;
1495 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1496 }
1497 /* In this case we might have to convert down from 4-byte native
1498 wchar_t to 2-byte unicode. */
1499 else if (maxchar < 65536) {
1500 assert(num_surrogates == 0 &&
1501 "FindMaxCharAndNumSurrogatePairs() messed up");
1502
Victor Stinner506f5922011-09-28 22:34:18 +02001503#if SIZEOF_WCHAR_T == 2
1504 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001505 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001506 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1507 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1508 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001509 _PyUnicode_UTF8(unicode) = NULL;
1510 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001511#else
1512 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001513 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001514 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001515 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001516 PyErr_NoMemory();
1517 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001518 }
Victor Stinner506f5922011-09-28 22:34:18 +02001519 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1520 _PyUnicode_WSTR(unicode), end,
1521 PyUnicode_2BYTE_DATA(unicode));
1522 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1523 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1524 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001525 _PyUnicode_UTF8(unicode) = NULL;
1526 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001527 PyObject_FREE(_PyUnicode_WSTR(unicode));
1528 _PyUnicode_WSTR(unicode) = NULL;
1529 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1530#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001531 }
1532 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1533 else {
1534#if SIZEOF_WCHAR_T == 2
1535 /* in case the native representation is 2-bytes, we need to allocate a
1536 new normalized 4-byte version. */
1537 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001538 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1539 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001540 PyErr_NoMemory();
1541 return -1;
1542 }
1543 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1544 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001545 _PyUnicode_UTF8(unicode) = NULL;
1546 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001547 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1548 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001549 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001550 PyObject_FREE(_PyUnicode_WSTR(unicode));
1551 _PyUnicode_WSTR(unicode) = NULL;
1552 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1553#else
1554 assert(num_surrogates == 0);
1555
Victor Stinnerc3c74152011-10-02 20:39:55 +02001556 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001557 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001558 _PyUnicode_UTF8(unicode) = NULL;
1559 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001560 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1561#endif
1562 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1563 }
1564 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001565 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566 return 0;
1567}
1568
Alexander Belopolsky40018472011-02-26 01:02:56 +00001569static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001570unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001571{
Walter Dörwald16807132007-05-25 13:52:07 +00001572 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001573 case SSTATE_NOT_INTERNED:
1574 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001575
Benjamin Peterson29060642009-01-31 22:14:21 +00001576 case SSTATE_INTERNED_MORTAL:
1577 /* revive dead object temporarily for DelItem */
1578 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001579 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001580 Py_FatalError(
1581 "deletion of interned string failed");
1582 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001583
Benjamin Peterson29060642009-01-31 22:14:21 +00001584 case SSTATE_INTERNED_IMMORTAL:
1585 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001586
Benjamin Peterson29060642009-01-31 22:14:21 +00001587 default:
1588 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001589 }
1590
Victor Stinner03490912011-10-03 23:45:12 +02001591 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001592 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001593 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001594 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001595 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1596 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001598 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001599}
1600
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared objects kept by
   this module -- the empty string, or the cached one-character string for a
   code point below 256 -- and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    const PyASCIIObject *header = (const PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with canonical storage can live in the
       unicode_latin1 cache. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first >= 256)
        return 0;
    return unicode_latin1[first] == unicode;
}
#endif
1617
Alexander Belopolsky40018472011-02-26 01:02:56 +00001618static int
Victor Stinner488fa492011-12-12 00:01:39 +01001619unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001620{
Victor Stinner488fa492011-12-12 00:01:39 +01001621 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001622 if (Py_REFCNT(unicode) != 1)
1623 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001624 if (_PyUnicode_HASH(unicode) != -1)
1625 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001626 if (PyUnicode_CHECK_INTERNED(unicode))
1627 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001628 if (!PyUnicode_CheckExact(unicode))
1629 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001630#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001631 /* singleton refcount is greater than 1 */
1632 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001633#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001634 return 1;
1635}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001636
Victor Stinnerfe226c02011-10-03 03:52:20 +02001637static int
1638unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1639{
1640 PyObject *unicode;
1641 Py_ssize_t old_length;
1642
1643 assert(p_unicode != NULL);
1644 unicode = *p_unicode;
1645
1646 assert(unicode != NULL);
1647 assert(PyUnicode_Check(unicode));
1648 assert(0 <= length);
1649
Victor Stinner910337b2011-10-03 03:20:16 +02001650 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001651 old_length = PyUnicode_WSTR_LENGTH(unicode);
1652 else
1653 old_length = PyUnicode_GET_LENGTH(unicode);
1654 if (old_length == length)
1655 return 0;
1656
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001657 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001658 _Py_INCREF_UNICODE_EMPTY();
1659 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001660 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001661 Py_DECREF(*p_unicode);
1662 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001663 return 0;
1664 }
1665
Victor Stinner488fa492011-12-12 00:01:39 +01001666 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001667 PyObject *copy = resize_copy(unicode, length);
1668 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001669 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001670 Py_DECREF(*p_unicode);
1671 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001672 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001673 }
1674
Victor Stinnerfe226c02011-10-03 03:52:20 +02001675 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001676 PyObject *new_unicode = resize_compact(unicode, length);
1677 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001678 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001679 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001680 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001681 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001682 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001683}
1684
Alexander Belopolsky40018472011-02-26 01:02:56 +00001685int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001686PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001687{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001688 PyObject *unicode;
1689 if (p_unicode == NULL) {
1690 PyErr_BadInternalCall();
1691 return -1;
1692 }
1693 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001694 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001695 {
1696 PyErr_BadInternalCall();
1697 return -1;
1698 }
1699 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001700}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001701
Victor Stinnerc5166102012-02-22 13:55:02 +01001702/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001703
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001704 WARNING: The function doesn't copy the terminating null character and
1705 doesn't check the maximum character (may write a latin1 character in an
1706 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001707static void
1708unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1709 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001710{
1711 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1712 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001713 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001714
1715 switch (kind) {
1716 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001717 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001718#ifdef Py_DEBUG
1719 if (PyUnicode_IS_ASCII(unicode)) {
1720 Py_UCS4 maxchar = ucs1lib_find_max_char(
1721 (const Py_UCS1*)str,
1722 (const Py_UCS1*)str + len);
1723 assert(maxchar < 128);
1724 }
1725#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001726 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001727 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001728 }
1729 case PyUnicode_2BYTE_KIND: {
1730 Py_UCS2 *start = (Py_UCS2 *)data + index;
1731 Py_UCS2 *ucs2 = start;
1732 assert(index <= PyUnicode_GET_LENGTH(unicode));
1733
Victor Stinner184252a2012-06-16 02:57:41 +02001734 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001735 *ucs2 = (Py_UCS2)*str;
1736
1737 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001738 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001739 }
1740 default: {
1741 Py_UCS4 *start = (Py_UCS4 *)data + index;
1742 Py_UCS4 *ucs4 = start;
1743 assert(kind == PyUnicode_4BYTE_KIND);
1744 assert(index <= PyUnicode_GET_LENGTH(unicode));
1745
Victor Stinner184252a2012-06-16 02:57:41 +02001746 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001747 *ucs4 = (Py_UCS4)*str;
1748
1749 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001750 }
1751 }
1752}
1753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754static PyObject*
1755get_latin1_char(unsigned char ch)
1756{
Victor Stinnera464fc12011-10-02 20:39:30 +02001757 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001759 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 if (!unicode)
1761 return NULL;
1762 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001763 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764 unicode_latin1[ch] = unicode;
1765 }
1766 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001767 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768}
1769
Victor Stinner985a82a2014-01-03 12:53:47 +01001770static PyObject*
1771unicode_char(Py_UCS4 ch)
1772{
1773 PyObject *unicode;
1774
1775 assert(ch <= MAX_UNICODE);
1776
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001777 if (ch < 256)
1778 return get_latin1_char(ch);
1779
Victor Stinner985a82a2014-01-03 12:53:47 +01001780 unicode = PyUnicode_New(1, ch);
1781 if (unicode == NULL)
1782 return NULL;
1783 switch (PyUnicode_KIND(unicode)) {
1784 case PyUnicode_1BYTE_KIND:
1785 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1786 break;
1787 case PyUnicode_2BYTE_KIND:
1788 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1789 break;
1790 default:
1791 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1792 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1793 }
1794 assert(_PyUnicode_CheckConsistency(unicode, 1));
1795 return unicode;
1796}
1797
Alexander Belopolsky40018472011-02-26 01:02:56 +00001798PyObject *
1799PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001801 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 Py_UCS4 maxchar = 0;
1803 Py_ssize_t num_surrogates;
1804
1805 if (u == NULL)
1806 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001808 /* If the Unicode data is known at construction time, we can apply
1809 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001812 if (size == 0)
1813 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001815 /* Single character Unicode objects in the Latin-1 range are
1816 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001817 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818 return get_latin1_char((unsigned char)*u);
1819
1820 /* If not empty and not single character, copy the Unicode data
1821 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001822 if (find_maxchar_surrogates(u, u + size,
1823 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001824 return NULL;
1825
Victor Stinner8faf8212011-12-08 22:14:11 +01001826 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001827 if (!unicode)
1828 return NULL;
1829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001830 switch (PyUnicode_KIND(unicode)) {
1831 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001832 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001833 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1834 break;
1835 case PyUnicode_2BYTE_KIND:
1836#if Py_UNICODE_SIZE == 2
1837 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1838#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001839 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1841#endif
1842 break;
1843 case PyUnicode_4BYTE_KIND:
1844#if SIZEOF_WCHAR_T == 2
1845 /* This is the only case which has to process surrogates, thus
1846 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001847 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001848#else
1849 assert(num_surrogates == 0);
1850 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1851#endif
1852 break;
1853 default:
1854 assert(0 && "Impossible state");
1855 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001857 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858}
1859
Alexander Belopolsky40018472011-02-26 01:02:56 +00001860PyObject *
1861PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001862{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001863 if (size < 0) {
1864 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001865 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001866 return NULL;
1867 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001868 if (u != NULL)
1869 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1870 else
1871 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001872}
1873
Alexander Belopolsky40018472011-02-26 01:02:56 +00001874PyObject *
1875PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001876{
1877 size_t size = strlen(u);
1878 if (size > PY_SSIZE_T_MAX) {
1879 PyErr_SetString(PyExc_OverflowError, "input too long");
1880 return NULL;
1881 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001882 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001883}
1884
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001885PyObject *
1886_PyUnicode_FromId(_Py_Identifier *id)
1887{
1888 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001889 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1890 strlen(id->string),
1891 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001892 if (!id->object)
1893 return NULL;
1894 PyUnicode_InternInPlace(&id->object);
1895 assert(!id->next);
1896 id->next = static_strings;
1897 static_strings = id;
1898 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001899 return id->object;
1900}
1901
1902void
1903_PyUnicode_ClearStaticStrings()
1904{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001905 _Py_Identifier *tmp, *s = static_strings;
1906 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001907 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001908 tmp = s->next;
1909 s->next = NULL;
1910 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001911 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001912 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001913}
1914
Benjamin Peterson0df54292012-03-26 14:50:32 -04001915/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001916
Victor Stinnerd3f08822012-05-29 12:57:52 +02001917PyObject*
1918_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001919{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001920 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001921 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001922 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001923#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001924 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001925#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001926 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001927 }
Victor Stinner785938e2011-12-11 20:09:03 +01001928 unicode = PyUnicode_New(size, 127);
1929 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001930 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001931 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1932 assert(_PyUnicode_CheckConsistency(unicode, 1));
1933 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001934}
1935
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001936static Py_UCS4
1937kind_maxchar_limit(unsigned int kind)
1938{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001939 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001940 case PyUnicode_1BYTE_KIND:
1941 return 0x80;
1942 case PyUnicode_2BYTE_KIND:
1943 return 0x100;
1944 case PyUnicode_4BYTE_KIND:
1945 return 0x10000;
1946 default:
1947 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001948 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001949 }
1950}
1951
Victor Stinnere6abb482012-05-02 01:15:40 +02001952Py_LOCAL_INLINE(Py_UCS4)
1953align_maxchar(Py_UCS4 maxchar)
1954{
1955 if (maxchar <= 127)
1956 return 127;
1957 else if (maxchar <= 255)
1958 return 255;
1959 else if (maxchar <= 65535)
1960 return 65535;
1961 else
1962 return MAX_UNICODE;
1963}
1964
Victor Stinner702c7342011-10-05 13:50:52 +02001965static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001966_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001967{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001969 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001970
Serhiy Storchaka678db842013-01-26 12:16:36 +02001971 if (size == 0)
1972 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001973 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001974 if (size == 1)
1975 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001976
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001977 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001978 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001979 if (!res)
1980 return NULL;
1981 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001982 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001983 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001984}
1985
Victor Stinnere57b1c02011-09-28 22:20:48 +02001986static PyObject*
1987_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988{
1989 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001990 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001991
Serhiy Storchaka678db842013-01-26 12:16:36 +02001992 if (size == 0)
1993 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001994 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01001995 if (size == 1)
1996 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001997
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001998 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001999 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 if (!res)
2001 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002002 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002003 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002004 else {
2005 _PyUnicode_CONVERT_BYTES(
2006 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2007 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002008 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009 return res;
2010}
2011
Victor Stinnere57b1c02011-09-28 22:20:48 +02002012static PyObject*
2013_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014{
2015 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002016 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002017
Serhiy Storchaka678db842013-01-26 12:16:36 +02002018 if (size == 0)
2019 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002020 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002021 if (size == 1)
2022 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002023
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002024 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002025 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 if (!res)
2027 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002028 if (max_char < 256)
2029 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2030 PyUnicode_1BYTE_DATA(res));
2031 else if (max_char < 0x10000)
2032 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2033 PyUnicode_2BYTE_DATA(res));
2034 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002036 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 return res;
2038}
2039
2040PyObject*
2041PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2042{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002043 if (size < 0) {
2044 PyErr_SetString(PyExc_ValueError, "size must be positive");
2045 return NULL;
2046 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002047 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002049 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002051 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002052 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002053 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002054 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002055 PyErr_SetString(PyExc_SystemError, "invalid kind");
2056 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002057 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002058}
2059
Victor Stinnerece58de2012-04-23 23:36:38 +02002060Py_UCS4
2061_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2062{
2063 enum PyUnicode_Kind kind;
2064 void *startptr, *endptr;
2065
2066 assert(PyUnicode_IS_READY(unicode));
2067 assert(0 <= start);
2068 assert(end <= PyUnicode_GET_LENGTH(unicode));
2069 assert(start <= end);
2070
2071 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2072 return PyUnicode_MAX_CHAR_VALUE(unicode);
2073
2074 if (start == end)
2075 return 127;
2076
Victor Stinner94d558b2012-04-27 22:26:58 +02002077 if (PyUnicode_IS_ASCII(unicode))
2078 return 127;
2079
Victor Stinnerece58de2012-04-23 23:36:38 +02002080 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002081 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002082 endptr = (char *)startptr + end * kind;
2083 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002084 switch(kind) {
2085 case PyUnicode_1BYTE_KIND:
2086 return ucs1lib_find_max_char(startptr, endptr);
2087 case PyUnicode_2BYTE_KIND:
2088 return ucs2lib_find_max_char(startptr, endptr);
2089 case PyUnicode_4BYTE_KIND:
2090 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002091 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002092 assert(0);
2093 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002094 }
2095}
2096
Victor Stinner25a4b292011-10-06 12:31:55 +02002097/* Ensure that a string uses the most efficient storage, if it is not the
2098 case: create a new string with of the right kind. Write NULL into *p_unicode
2099 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002100static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002101unicode_adjust_maxchar(PyObject **p_unicode)
2102{
2103 PyObject *unicode, *copy;
2104 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002105 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002106 unsigned int kind;
2107
2108 assert(p_unicode != NULL);
2109 unicode = *p_unicode;
2110 assert(PyUnicode_IS_READY(unicode));
2111 if (PyUnicode_IS_ASCII(unicode))
2112 return;
2113
2114 len = PyUnicode_GET_LENGTH(unicode);
2115 kind = PyUnicode_KIND(unicode);
2116 if (kind == PyUnicode_1BYTE_KIND) {
2117 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002118 max_char = ucs1lib_find_max_char(u, u + len);
2119 if (max_char >= 128)
2120 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002121 }
2122 else if (kind == PyUnicode_2BYTE_KIND) {
2123 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002124 max_char = ucs2lib_find_max_char(u, u + len);
2125 if (max_char >= 256)
2126 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002127 }
2128 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002129 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002130 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002131 max_char = ucs4lib_find_max_char(u, u + len);
2132 if (max_char >= 0x10000)
2133 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002134 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002135 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002136 if (copy != NULL)
2137 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002138 Py_DECREF(unicode);
2139 *p_unicode = copy;
2140}
2141
Victor Stinner034f6cf2011-09-30 02:26:44 +02002142PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002143_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002144{
Victor Stinner87af4f22011-11-21 23:03:47 +01002145 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002146 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002147
Victor Stinner034f6cf2011-09-30 02:26:44 +02002148 if (!PyUnicode_Check(unicode)) {
2149 PyErr_BadInternalCall();
2150 return NULL;
2151 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002152 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002153 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002154
Victor Stinner87af4f22011-11-21 23:03:47 +01002155 length = PyUnicode_GET_LENGTH(unicode);
2156 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002157 if (!copy)
2158 return NULL;
2159 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2160
Victor Stinner87af4f22011-11-21 23:03:47 +01002161 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2162 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002163 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002164 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002165}
2166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002167
Victor Stinnerbc603d12011-10-02 01:00:40 +02002168/* Widen Unicode objects to larger buffers. Don't write terminating null
2169 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002170
2171void*
2172_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2173{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002174 Py_ssize_t len;
2175 void *result;
2176 unsigned int skind;
2177
Benjamin Petersonbac79492012-01-14 13:34:47 -05002178 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002179 return NULL;
2180
2181 len = PyUnicode_GET_LENGTH(s);
2182 skind = PyUnicode_KIND(s);
2183 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002184 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 return NULL;
2186 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002187 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002188 case PyUnicode_2BYTE_KIND:
2189 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2190 if (!result)
2191 return PyErr_NoMemory();
2192 assert(skind == PyUnicode_1BYTE_KIND);
2193 _PyUnicode_CONVERT_BYTES(
2194 Py_UCS1, Py_UCS2,
2195 PyUnicode_1BYTE_DATA(s),
2196 PyUnicode_1BYTE_DATA(s) + len,
2197 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002198 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002199 case PyUnicode_4BYTE_KIND:
2200 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2201 if (!result)
2202 return PyErr_NoMemory();
2203 if (skind == PyUnicode_2BYTE_KIND) {
2204 _PyUnicode_CONVERT_BYTES(
2205 Py_UCS2, Py_UCS4,
2206 PyUnicode_2BYTE_DATA(s),
2207 PyUnicode_2BYTE_DATA(s) + len,
2208 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002210 else {
2211 assert(skind == PyUnicode_1BYTE_KIND);
2212 _PyUnicode_CONVERT_BYTES(
2213 Py_UCS1, Py_UCS4,
2214 PyUnicode_1BYTE_DATA(s),
2215 PyUnicode_1BYTE_DATA(s) + len,
2216 result);
2217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002219 default:
2220 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 }
Victor Stinner01698042011-10-04 00:04:26 +02002222 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 return NULL;
2224}
2225
2226static Py_UCS4*
2227as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2228 int copy_null)
2229{
2230 int kind;
2231 void *data;
2232 Py_ssize_t len, targetlen;
2233 if (PyUnicode_READY(string) == -1)
2234 return NULL;
2235 kind = PyUnicode_KIND(string);
2236 data = PyUnicode_DATA(string);
2237 len = PyUnicode_GET_LENGTH(string);
2238 targetlen = len;
2239 if (copy_null)
2240 targetlen++;
2241 if (!target) {
2242 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2243 PyErr_NoMemory();
2244 return NULL;
2245 }
2246 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2247 if (!target) {
2248 PyErr_NoMemory();
2249 return NULL;
2250 }
2251 }
2252 else {
2253 if (targetsize < targetlen) {
2254 PyErr_Format(PyExc_SystemError,
2255 "string is longer than the buffer");
2256 if (copy_null && 0 < targetsize)
2257 target[0] = 0;
2258 return NULL;
2259 }
2260 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002261 if (kind == PyUnicode_1BYTE_KIND) {
2262 Py_UCS1 *start = (Py_UCS1 *) data;
2263 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002265 else if (kind == PyUnicode_2BYTE_KIND) {
2266 Py_UCS2 *start = (Py_UCS2 *) data;
2267 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2268 }
2269 else {
2270 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002272 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 if (copy_null)
2274 target[len] = 0;
2275 return target;
2276}
2277
2278Py_UCS4*
2279PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2280 int copy_null)
2281{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002282 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002283 PyErr_BadInternalCall();
2284 return NULL;
2285 }
2286 return as_ucs4(string, target, targetsize, copy_null);
2287}
2288
2289Py_UCS4*
2290PyUnicode_AsUCS4Copy(PyObject *string)
2291{
2292 return as_ucs4(string, NULL, 0, 1);
2293}
2294
2295#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002296
Alexander Belopolsky40018472011-02-26 01:02:56 +00002297PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002298PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002302 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002303 PyErr_BadInternalCall();
2304 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002305 }
2306
Martin v. Löwis790465f2008-04-05 20:41:37 +00002307 if (size == -1) {
2308 size = wcslen(w);
2309 }
2310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002311 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312}
2313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002314#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002315
Walter Dörwald346737f2007-05-31 10:44:43 +00002316static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002317makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002318 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002319{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002320 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002321 if (longflag)
2322 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002323 else if (longlongflag) {
2324 /* longlongflag should only ever be nonzero on machines with
2325 HAVE_LONG_LONG defined */
2326#ifdef HAVE_LONG_LONG
2327 char *f = PY_FORMAT_LONG_LONG;
2328 while (*f)
2329 *fmt++ = *f++;
2330#else
2331 /* we shouldn't ever get here */
2332 assert(0);
2333 *fmt++ = 'l';
2334#endif
2335 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002336 else if (size_tflag) {
2337 char *f = PY_FORMAT_SIZE_T;
2338 while (*f)
2339 *fmt++ = *f++;
2340 }
2341 *fmt++ = c;
2342 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002343}
2344
Victor Stinner15a11362012-10-06 23:48:20 +02002345/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002346 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2347 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2348#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002349
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002350static int
2351unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2352 Py_ssize_t width, Py_ssize_t precision)
2353{
2354 Py_ssize_t length, fill, arglen;
2355 Py_UCS4 maxchar;
2356
2357 if (PyUnicode_READY(str) == -1)
2358 return -1;
2359
2360 length = PyUnicode_GET_LENGTH(str);
2361 if ((precision == -1 || precision >= length)
2362 && width <= length)
2363 return _PyUnicodeWriter_WriteStr(writer, str);
2364
2365 if (precision != -1)
2366 length = Py_MIN(precision, length);
2367
2368 arglen = Py_MAX(length, width);
2369 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2370 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2371 else
2372 maxchar = writer->maxchar;
2373
2374 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2375 return -1;
2376
2377 if (width > length) {
2378 fill = width - length;
2379 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2380 return -1;
2381 writer->pos += fill;
2382 }
2383
2384 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2385 str, 0, length);
2386 writer->pos += length;
2387 return 0;
2388}
2389
2390static int
2391unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2392 Py_ssize_t width, Py_ssize_t precision)
2393{
2394 /* UTF-8 */
2395 Py_ssize_t length;
2396 PyObject *unicode;
2397 int res;
2398
2399 length = strlen(str);
2400 if (precision != -1)
2401 length = Py_MIN(length, precision);
2402 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2403 if (unicode == NULL)
2404 return -1;
2405
2406 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2407 Py_DECREF(unicode);
2408 return res;
2409}
2410
Victor Stinner96865452011-03-01 23:44:09 +00002411static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002412unicode_fromformat_arg(_PyUnicodeWriter *writer,
2413 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002414{
Victor Stinnere215d962012-10-06 23:03:36 +02002415 const char *p;
2416 Py_ssize_t len;
2417 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002418 Py_ssize_t width;
2419 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002420 int longflag;
2421 int longlongflag;
2422 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002423 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002424
2425 p = f;
2426 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002427 zeropad = 0;
2428 if (*f == '0') {
2429 zeropad = 1;
2430 f++;
2431 }
Victor Stinner96865452011-03-01 23:44:09 +00002432
2433 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002434 width = -1;
2435 if (Py_ISDIGIT((unsigned)*f)) {
2436 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002437 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002438 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002439 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002440 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002441 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002442 return NULL;
2443 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002444 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002445 f++;
2446 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002447 }
2448 precision = -1;
2449 if (*f == '.') {
2450 f++;
2451 if (Py_ISDIGIT((unsigned)*f)) {
2452 precision = (*f - '0');
2453 f++;
2454 while (Py_ISDIGIT((unsigned)*f)) {
2455 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2456 PyErr_SetString(PyExc_ValueError,
2457 "precision too big");
2458 return NULL;
2459 }
2460 precision = (precision * 10) + (*f - '0');
2461 f++;
2462 }
2463 }
Victor Stinner96865452011-03-01 23:44:09 +00002464 if (*f == '%') {
2465 /* "%.3%s" => f points to "3" */
2466 f--;
2467 }
2468 }
2469 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002470 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002471 f--;
2472 }
Victor Stinner96865452011-03-01 23:44:09 +00002473
2474 /* Handle %ld, %lu, %lld and %llu. */
2475 longflag = 0;
2476 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002477 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002478 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002479 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002480 longflag = 1;
2481 ++f;
2482 }
2483#ifdef HAVE_LONG_LONG
2484 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002485 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002486 longlongflag = 1;
2487 f += 2;
2488 }
2489#endif
2490 }
2491 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002492 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002493 size_tflag = 1;
2494 ++f;
2495 }
Victor Stinnere215d962012-10-06 23:03:36 +02002496
2497 if (f[1] == '\0')
2498 writer->overallocate = 0;
2499
2500 switch (*f) {
2501 case 'c':
2502 {
2503 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002504 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002505 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002506 "character argument not in range(0x110000)");
2507 return NULL;
2508 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002509 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002510 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002511 break;
2512 }
2513
2514 case 'i':
2515 case 'd':
2516 case 'u':
2517 case 'x':
2518 {
2519 /* used by sprintf */
2520 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002521 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002522 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002523
2524 if (*f == 'u') {
2525 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2526
2527 if (longflag)
2528 len = sprintf(buffer, fmt,
2529 va_arg(*vargs, unsigned long));
2530#ifdef HAVE_LONG_LONG
2531 else if (longlongflag)
2532 len = sprintf(buffer, fmt,
2533 va_arg(*vargs, unsigned PY_LONG_LONG));
2534#endif
2535 else if (size_tflag)
2536 len = sprintf(buffer, fmt,
2537 va_arg(*vargs, size_t));
2538 else
2539 len = sprintf(buffer, fmt,
2540 va_arg(*vargs, unsigned int));
2541 }
2542 else if (*f == 'x') {
2543 makefmt(fmt, 0, 0, 0, 'x');
2544 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2545 }
2546 else {
2547 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2548
2549 if (longflag)
2550 len = sprintf(buffer, fmt,
2551 va_arg(*vargs, long));
2552#ifdef HAVE_LONG_LONG
2553 else if (longlongflag)
2554 len = sprintf(buffer, fmt,
2555 va_arg(*vargs, PY_LONG_LONG));
2556#endif
2557 else if (size_tflag)
2558 len = sprintf(buffer, fmt,
2559 va_arg(*vargs, Py_ssize_t));
2560 else
2561 len = sprintf(buffer, fmt,
2562 va_arg(*vargs, int));
2563 }
2564 assert(len >= 0);
2565
Victor Stinnere215d962012-10-06 23:03:36 +02002566 if (precision < len)
2567 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002568
2569 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002570 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2571 return NULL;
2572
Victor Stinnere215d962012-10-06 23:03:36 +02002573 if (width > precision) {
2574 Py_UCS4 fillchar;
2575 fill = width - precision;
2576 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002577 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2578 return NULL;
2579 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002580 }
Victor Stinner15a11362012-10-06 23:48:20 +02002581 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002582 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002583 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2584 return NULL;
2585 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002586 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002587
Victor Stinner4a587072013-11-19 12:54:53 +01002588 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2589 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002590 break;
2591 }
2592
2593 case 'p':
2594 {
2595 char number[MAX_LONG_LONG_CHARS];
2596
2597 len = sprintf(number, "%p", va_arg(*vargs, void*));
2598 assert(len >= 0);
2599
2600 /* %p is ill-defined: ensure leading 0x. */
2601 if (number[1] == 'X')
2602 number[1] = 'x';
2603 else if (number[1] != 'x') {
2604 memmove(number + 2, number,
2605 strlen(number) + 1);
2606 number[0] = '0';
2607 number[1] = 'x';
2608 len += 2;
2609 }
2610
Victor Stinner4a587072013-11-19 12:54:53 +01002611 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002612 return NULL;
2613 break;
2614 }
2615
2616 case 's':
2617 {
2618 /* UTF-8 */
2619 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002620 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002621 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002622 break;
2623 }
2624
2625 case 'U':
2626 {
2627 PyObject *obj = va_arg(*vargs, PyObject *);
2628 assert(obj && _PyUnicode_CHECK(obj));
2629
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002630 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002631 return NULL;
2632 break;
2633 }
2634
2635 case 'V':
2636 {
2637 PyObject *obj = va_arg(*vargs, PyObject *);
2638 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002639 if (obj) {
2640 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002641 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002642 return NULL;
2643 }
2644 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002645 assert(str != NULL);
2646 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002647 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002648 }
2649 break;
2650 }
2651
2652 case 'S':
2653 {
2654 PyObject *obj = va_arg(*vargs, PyObject *);
2655 PyObject *str;
2656 assert(obj);
2657 str = PyObject_Str(obj);
2658 if (!str)
2659 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002660 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002661 Py_DECREF(str);
2662 return NULL;
2663 }
2664 Py_DECREF(str);
2665 break;
2666 }
2667
2668 case 'R':
2669 {
2670 PyObject *obj = va_arg(*vargs, PyObject *);
2671 PyObject *repr;
2672 assert(obj);
2673 repr = PyObject_Repr(obj);
2674 if (!repr)
2675 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002676 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002677 Py_DECREF(repr);
2678 return NULL;
2679 }
2680 Py_DECREF(repr);
2681 break;
2682 }
2683
2684 case 'A':
2685 {
2686 PyObject *obj = va_arg(*vargs, PyObject *);
2687 PyObject *ascii;
2688 assert(obj);
2689 ascii = PyObject_ASCII(obj);
2690 if (!ascii)
2691 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002692 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002693 Py_DECREF(ascii);
2694 return NULL;
2695 }
2696 Py_DECREF(ascii);
2697 break;
2698 }
2699
2700 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002701 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002702 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002703 break;
2704
2705 default:
2706 /* if we stumble upon an unknown formatting code, copy the rest
2707 of the format string to the output string. (we cannot just
2708 skip the code, since there's no way to know what's in the
2709 argument list) */
2710 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002711 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002712 return NULL;
2713 f = p+len;
2714 return f;
2715 }
2716
2717 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002718 return f;
2719}
2720
Walter Dörwaldd2034312007-05-18 16:29:38 +00002721PyObject *
2722PyUnicode_FromFormatV(const char *format, va_list vargs)
2723{
Victor Stinnere215d962012-10-06 23:03:36 +02002724 va_list vargs2;
2725 const char *f;
2726 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002727
Victor Stinner8f674cc2013-04-17 23:02:17 +02002728 _PyUnicodeWriter_Init(&writer);
2729 writer.min_length = strlen(format) + 100;
2730 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002731
2732 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2733 Copy it to be able to pass a reference to a subfunction. */
2734 Py_VA_COPY(vargs2, vargs);
2735
2736 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002737 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002738 f = unicode_fromformat_arg(&writer, f, &vargs2);
2739 if (f == NULL)
2740 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002741 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002742 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002743 const char *p;
2744 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002745
Victor Stinnere215d962012-10-06 23:03:36 +02002746 p = f;
2747 do
2748 {
2749 if ((unsigned char)*p > 127) {
2750 PyErr_Format(PyExc_ValueError,
2751 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2752 "string, got a non-ASCII byte: 0x%02x",
2753 (unsigned char)*p);
2754 return NULL;
2755 }
2756 p++;
2757 }
2758 while (*p != '\0' && *p != '%');
2759 len = p - f;
2760
2761 if (*p == '\0')
2762 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002763
2764 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002765 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002766
2767 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002768 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002769 }
Victor Stinnere215d962012-10-06 23:03:36 +02002770 return _PyUnicodeWriter_Finish(&writer);
2771
2772 fail:
2773 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002774 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002775}
2776
Walter Dörwaldd2034312007-05-18 16:29:38 +00002777PyObject *
2778PyUnicode_FromFormat(const char *format, ...)
2779{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002780 PyObject* ret;
2781 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002782
2783#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002784 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002785#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002786 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002787#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002788 ret = PyUnicode_FromFormatV(format, vargs);
2789 va_end(vargs);
2790 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002791}
2792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002793#ifdef HAVE_WCHAR_H
2794
Victor Stinner5593d8a2010-10-02 11:11:27 +00002795/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2796 convert a Unicode object to a wide character string.
2797
Victor Stinnerd88d9832011-09-06 02:00:05 +02002798 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002799 character) required to convert the unicode object. Ignore size argument.
2800
Victor Stinnerd88d9832011-09-06 02:00:05 +02002801 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002802 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002803 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002804static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002805unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002806 wchar_t *w,
2807 Py_ssize_t size)
2808{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002809 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002810 const wchar_t *wstr;
2811
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002812 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002813 if (wstr == NULL)
2814 return -1;
2815
Victor Stinner5593d8a2010-10-02 11:11:27 +00002816 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002817 if (size > res)
2818 size = res + 1;
2819 else
2820 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002821 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002822 return res;
2823 }
2824 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002825 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002826}
2827
2828Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002829PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002830 wchar_t *w,
2831 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832{
2833 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002834 PyErr_BadInternalCall();
2835 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002837 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838}
2839
Victor Stinner137c34c2010-09-29 10:25:54 +00002840wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002841PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002842 Py_ssize_t *size)
2843{
2844 wchar_t* buffer;
2845 Py_ssize_t buflen;
2846
2847 if (unicode == NULL) {
2848 PyErr_BadInternalCall();
2849 return NULL;
2850 }
2851
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002852 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002853 if (buflen == -1)
2854 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002855 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002856 PyErr_NoMemory();
2857 return NULL;
2858 }
2859
Victor Stinner137c34c2010-09-29 10:25:54 +00002860 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2861 if (buffer == NULL) {
2862 PyErr_NoMemory();
2863 return NULL;
2864 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002865 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002866 if (buflen == -1) {
2867 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002868 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002869 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002870 if (size != NULL)
2871 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002872 return buffer;
2873}
2874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002875#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002876
Alexander Belopolsky40018472011-02-26 01:02:56 +00002877PyObject *
2878PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002879{
Victor Stinner8faf8212011-12-08 22:14:11 +01002880 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002881 PyErr_SetString(PyExc_ValueError,
2882 "chr() arg not in range(0x110000)");
2883 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002884 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002885
Victor Stinner985a82a2014-01-03 12:53:47 +01002886 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002887}
2888
Alexander Belopolsky40018472011-02-26 01:02:56 +00002889PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002890PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002892 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002893 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002894 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002895 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002896 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002897 Py_INCREF(obj);
2898 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002899 }
2900 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002901 /* For a Unicode subtype that's not a Unicode object,
2902 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002903 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002904 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002905 PyErr_Format(PyExc_TypeError,
2906 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002907 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002908 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002909}
2910
Alexander Belopolsky40018472011-02-26 01:02:56 +00002911PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002912PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002913 const char *encoding,
2914 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002915{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002916 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002917 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002918
Guido van Rossumd57fd912000-03-10 22:53:23 +00002919 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002920 PyErr_BadInternalCall();
2921 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002922 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002923
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002924 /* Decoding bytes objects is the most common case and should be fast */
2925 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002926 if (PyBytes_GET_SIZE(obj) == 0)
2927 _Py_RETURN_UNICODE_EMPTY();
2928 v = PyUnicode_Decode(
2929 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2930 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002931 return v;
2932 }
2933
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002934 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002935 PyErr_SetString(PyExc_TypeError,
2936 "decoding str is not supported");
2937 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002938 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002939
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002940 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2941 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2942 PyErr_Format(PyExc_TypeError,
Serhiy Storchakab757c832014-12-05 22:25:22 +02002943 "coercing to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002944 Py_TYPE(obj)->tp_name);
2945 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002946 }
Tim Petersced69f82003-09-16 20:30:58 +00002947
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002948 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002949 PyBuffer_Release(&buffer);
2950 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002951 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002952
Serhiy Storchaka05997252013-01-26 12:14:02 +02002953 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002954 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002955 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002956}
2957
/* Normalize an encoding name into the caller-supplied buffer `lower`
   (capacity `lower_len` bytes, terminator included): upper case letters are
   converted to lower case and '_' is replaced with '-', in order to catch
   e.g. UTF_8. A NULL encoding is normalized to "utf-8".

   Return 1 on success, 0 on error (the normalized name, including its
   terminating null byte, does not fit into lower_len bytes). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst;
    char *dst_end;

    /* A zero-sized buffer cannot even hold the terminator. Rejecting it
       here also keeps &lower[lower_len - 1] below from pointing before
       the buffer, which would defeat the "buffer full" check and let the
       loop write out of bounds. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    dst = lower;
    /* Last slot of the buffer: reserved for the terminating null byte. */
    dst_end = &lower[lower_len - 1];
    for (src = encoding; *src != '\0'; src++) {
        if (dst == dst_end)
            return 0;
        if (Py_ISUPPER(*src))
            *dst++ = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst++ = '-';
        else
            *dst++ = *src;
    }
    *dst = '\0';
    return 1;
}
2997
Alexander Belopolsky40018472011-02-26 01:02:56 +00002998PyObject *
2999PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003000 Py_ssize_t size,
3001 const char *encoding,
3002 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003003{
3004 PyObject *buffer = NULL, *unicode;
3005 Py_buffer info;
3006 char lower[11]; /* Enough for any encoding shortcut */
3007
Fred Drakee4315f52000-05-09 19:53:39 +00003008 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003009 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003010 if ((strcmp(lower, "utf-8") == 0) ||
3011 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003012 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003013 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003014 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003015 (strcmp(lower, "iso-8859-1") == 0) ||
3016 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00003017 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003018#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003019 else if (strcmp(lower, "mbcs") == 0)
3020 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003021#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003022 else if (strcmp(lower, "ascii") == 0)
3023 return PyUnicode_DecodeASCII(s, size, errors);
3024 else if (strcmp(lower, "utf-16") == 0)
3025 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3026 else if (strcmp(lower, "utf-32") == 0)
3027 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3028 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029
3030 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003031 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003032 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003033 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003034 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035 if (buffer == NULL)
3036 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003037 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 if (unicode == NULL)
3039 goto onError;
3040 if (!PyUnicode_Check(unicode)) {
3041 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003042 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3043 "use codecs.decode() to decode to arbitrary types",
3044 encoding,
3045 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 Py_DECREF(unicode);
3047 goto onError;
3048 }
3049 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003050 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003051
Benjamin Peterson29060642009-01-31 22:14:21 +00003052 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053 Py_XDECREF(buffer);
3054 return NULL;
3055}
3056
Alexander Belopolsky40018472011-02-26 01:02:56 +00003057PyObject *
3058PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003059 const char *encoding,
3060 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003061{
3062 PyObject *v;
3063
3064 if (!PyUnicode_Check(unicode)) {
3065 PyErr_BadArgument();
3066 goto onError;
3067 }
3068
3069 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003070 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003071
3072 /* Decode via the codec registry */
3073 v = PyCodec_Decode(unicode, encoding, errors);
3074 if (v == NULL)
3075 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003076 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003077
Benjamin Peterson29060642009-01-31 22:14:21 +00003078 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003079 return NULL;
3080}
3081
Alexander Belopolsky40018472011-02-26 01:02:56 +00003082PyObject *
3083PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003084 const char *encoding,
3085 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003086{
3087 PyObject *v;
3088
3089 if (!PyUnicode_Check(unicode)) {
3090 PyErr_BadArgument();
3091 goto onError;
3092 }
3093
3094 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003095 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003096
3097 /* Decode via the codec registry */
3098 v = PyCodec_Decode(unicode, encoding, errors);
3099 if (v == NULL)
3100 goto onError;
3101 if (!PyUnicode_Check(v)) {
3102 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003103 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3104 "use codecs.decode() to decode to arbitrary types",
3105 encoding,
3106 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003107 Py_DECREF(v);
3108 goto onError;
3109 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003110 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003111
Benjamin Peterson29060642009-01-31 22:14:21 +00003112 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003113 return NULL;
3114}
3115
Alexander Belopolsky40018472011-02-26 01:02:56 +00003116PyObject *
3117PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003118 Py_ssize_t size,
3119 const char *encoding,
3120 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121{
3122 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003123
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124 unicode = PyUnicode_FromUnicode(s, size);
3125 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003126 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3128 Py_DECREF(unicode);
3129 return v;
3130}
3131
Alexander Belopolsky40018472011-02-26 01:02:56 +00003132PyObject *
3133PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003134 const char *encoding,
3135 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003136{
3137 PyObject *v;
3138
3139 if (!PyUnicode_Check(unicode)) {
3140 PyErr_BadArgument();
3141 goto onError;
3142 }
3143
3144 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003145 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003146
3147 /* Encode via the codec registry */
3148 v = PyCodec_Encode(unicode, encoding, errors);
3149 if (v == NULL)
3150 goto onError;
3151 return v;
3152
Benjamin Peterson29060642009-01-31 22:14:21 +00003153 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003154 return NULL;
3155}
3156
/* Locate the first wide character of `wstr` that wcstombs() refuses to
   convert.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide). Return the index, in
   wchar_t units, of the first unit whose conversion fails, or 0 when no
   failing character is found. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or one surrogate pair) plus the terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;
    const wchar_t *unit_start;

#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif
    for (cursor = wstr; *cursor != L'\0'; ) {
        /* Remember where this unit begins: that is the reported index. */
        unit_start = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3203
Victor Stinner1b579672011-12-17 05:47:23 +01003204static int
3205locale_error_handler(const char *errors, int *surrogateescape)
3206{
3207 if (errors == NULL) {
3208 *surrogateescape = 0;
3209 return 0;
3210 }
3211
3212 if (strcmp(errors, "strict") == 0) {
3213 *surrogateescape = 0;
3214 return 0;
3215 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003216 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003217 *surrogateescape = 1;
3218 return 0;
3219 }
3220 PyErr_Format(PyExc_ValueError,
3221 "only 'strict' and 'surrogateescape' error handlers "
3222 "are supported, not '%s'",
3223 errors);
3224 return -1;
3225}
3226
/* Encode the str object `unicode` to a bytes object using the C library's
   wide-character to multibyte conversion.

   `errors` must be NULL, "strict" or "surrogateescape" (validated by
   locale_error_handler()). A string containing an embedded null character
   is rejected with TypeError. When a character cannot be converted, a
   UnicodeEncodeError for the "locale" codec is built and handed to
   PyCodec_StrictErrors().

   Return a new bytes object, or NULL on failure. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    Py_ssize_t wlen, wlen2;
    wchar_t *wstr;
    PyObject *bytes = NULL;
    char *errmsg;
    PyObject *reason = NULL;
    PyObject *exc;
    size_t error_pos;
    int surrogateescape;

    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
    if (wstr == NULL)
        return NULL;

    /* wcslen() stops at the first null character; a mismatch with the
       length reported for the object means a null is embedded in it. */
    wlen2 = wcslen(wstr);
    if (wlen2 != wlen) {
        PyMem_Free(wstr);
        PyErr_SetString(PyExc_TypeError, "embedded null character");
        return NULL;
    }

    if (surrogateescape) {
        /* "surrogateescape" error handler */
        char *str;

        str = _Py_wchar2char(wstr, &error_pos);
        if (str == NULL) {
            /* error_pos == (size_t)-1 is treated as an allocation
               failure; any other value is taken as the position of the
               offending character. */
            if (error_pos == (size_t)-1) {
                PyErr_NoMemory();
                PyMem_Free(wstr);
                return NULL;
            }
            else {
                goto encode_error;
            }
        }
        PyMem_Free(wstr);

        bytes = PyBytes_FromString(str);
        PyMem_Free(str);
    }
    else {
        /* strict mode */
        size_t len, len2;

        /* First call only measures the output (no destination buffer). */
        len = wcstombs(NULL, wstr, 0);
        if (len == (size_t)-1) {
            /* Position unknown: encode_error will search for it. */
            error_pos = (size_t)-1;
            goto encode_error;
        }

        bytes = PyBytes_FromStringAndSize(NULL, len);
        if (bytes == NULL) {
            PyMem_Free(wstr);
            return NULL;
        }

        /* Second call converts into the bytes object; len+1 leaves room
           for the terminating null byte. */
        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
        if (len2 == (size_t)-1 || len2 > len) {
            error_pos = (size_t)-1;
            goto encode_error;
        }
        PyMem_Free(wstr);
    }
    return bytes;

encode_error:
    /* Read errno first, before any of the calls below can modify it. */
    errmsg = strerror(errno);
    assert(errmsg != NULL);

    if (error_pos == (size_t)-1)
        error_pos = wcstombs_errorpos(wstr);

    /* Both branches jump here still owning wstr (and possibly bytes). */
    PyMem_Free(wstr);
    Py_XDECREF(bytes);

    /* Use the system error message as the exception's reason when it can
       be converted to a str; otherwise fall back to a fixed message. */
    if (errmsg != NULL) {
        size_t errlen;
        wstr = _Py_char2wchar(errmsg, &errlen);
        if (wstr != NULL) {
            reason = PyUnicode_FromWideChar(wstr, errlen);
            PyMem_RawFree(wstr);
        } else
            errmsg = NULL;
    }
    if (errmsg == NULL)
        reason = PyUnicode_FromString(
            "wcstombs() encountered an unencodable "
            "wide character");
    if (reason == NULL)
        return NULL;

    /* UnicodeEncodeError(encoding, object, start, end, reason) covering
       the single character at error_pos. */
    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
                                "locale", unicode,
                                (Py_ssize_t)error_pos,
                                (Py_ssize_t)(error_pos+1),
                                reason);
    Py_DECREF(reason);
    if (exc != NULL) {
        /* NOTE(review): PyCodec_StrictErrors() is defined elsewhere; it is
           presumably what sets exc as the current exception. */
        PyCodec_StrictErrors(exc);
        Py_XDECREF(exc);
    }
    return NULL;
}
3336
Victor Stinnerad158722010-10-27 00:25:46 +00003337PyObject *
3338PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003339{
Victor Stinner99b95382011-07-04 14:23:54 +02003340#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003341 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003342#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003343 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003344#else
Victor Stinner793b5312011-04-27 00:24:21 +02003345 PyInterpreterState *interp = PyThreadState_GET()->interp;
3346 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3347 cannot use it to encode and decode filenames before it is loaded. Load
3348 the Python codec requires to encode at least its own filename. Use the C
3349 version of the locale codec until the codec registry is initialized and
3350 the Python codec is loaded.
3351
3352 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3353 cannot only rely on it: check also interp->fscodec_initialized for
3354 subinterpreters. */
3355 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003356 return PyUnicode_AsEncodedString(unicode,
3357 Py_FileSystemDefaultEncoding,
3358 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003359 }
3360 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003361 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003362 }
Victor Stinnerad158722010-10-27 00:25:46 +00003363#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003364}
3365
Alexander Belopolsky40018472011-02-26 01:02:56 +00003366PyObject *
3367PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003368 const char *encoding,
3369 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003370{
3371 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003372 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003373
Guido van Rossumd57fd912000-03-10 22:53:23 +00003374 if (!PyUnicode_Check(unicode)) {
3375 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003376 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377 }
Fred Drakee4315f52000-05-09 19:53:39 +00003378
Fred Drakee4315f52000-05-09 19:53:39 +00003379 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003380 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003381 if ((strcmp(lower, "utf-8") == 0) ||
3382 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003383 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003384 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003385 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003386 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003387 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003388 }
Victor Stinner37296e82010-06-10 13:36:23 +00003389 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003390 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003391 (strcmp(lower, "iso-8859-1") == 0) ||
3392 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003393 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003394#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003395 else if (strcmp(lower, "mbcs") == 0)
3396 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003397#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003398 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003399 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003400 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003401
3402 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003403 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003404 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003405 return NULL;
3406
3407 /* The normal path */
3408 if (PyBytes_Check(v))
3409 return v;
3410
3411 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003412 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003413 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003414 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003415
3416 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003417 "encoder %s returned bytearray instead of bytes; "
3418 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003419 encoding);
3420 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003421 Py_DECREF(v);
3422 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003423 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003424
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003425 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3426 Py_DECREF(v);
3427 return b;
3428 }
3429
3430 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003431 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3432 "use codecs.encode() to encode to arbitrary types",
3433 encoding,
3434 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003435 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003436 return NULL;
3437}
3438
Alexander Belopolsky40018472011-02-26 01:02:56 +00003439PyObject *
3440PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003441 const char *encoding,
3442 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003443{
3444 PyObject *v;
3445
3446 if (!PyUnicode_Check(unicode)) {
3447 PyErr_BadArgument();
3448 goto onError;
3449 }
3450
3451 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003452 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003453
3454 /* Encode via the codec registry */
3455 v = PyCodec_Encode(unicode, encoding, errors);
3456 if (v == NULL)
3457 goto onError;
3458 if (!PyUnicode_Check(v)) {
3459 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003460 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3461 "use codecs.encode() to encode to arbitrary types",
3462 encoding,
3463 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003464 Py_DECREF(v);
3465 goto onError;
3466 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003468
Benjamin Peterson29060642009-01-31 22:14:21 +00003469 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 return NULL;
3471}
3472
/* Locate the first byte sequence in the first `len` bytes of `str` that
   mbrtowc() cannot convert.

   Return the byte offset of the failing (invalid or incomplete) sequence.
   Return 0 when no failing sequence is found, and always when mbrtowc()
   is not available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
#endif
    return 0;
}
3503
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003504PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003505PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003506 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003507{
3508 wchar_t smallbuf[256];
3509 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3510 wchar_t *wstr;
3511 size_t wlen, wlen2;
3512 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003513 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003514 size_t error_pos;
3515 char *errmsg;
3516 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003517
3518 if (locale_error_handler(errors, &surrogateescape) < 0)
3519 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003520
3521 if (str[len] != '\0' || len != strlen(str)) {
3522 PyErr_SetString(PyExc_TypeError, "embedded null character");
3523 return NULL;
3524 }
3525
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003526 if (surrogateescape) {
3527 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003528 wstr = _Py_char2wchar(str, &wlen);
3529 if (wstr == NULL) {
3530 if (wlen == (size_t)-1)
3531 PyErr_NoMemory();
3532 else
3533 PyErr_SetFromErrno(PyExc_OSError);
3534 return NULL;
3535 }
3536
3537 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003538 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003539 }
3540 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003541 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003542#ifndef HAVE_BROKEN_MBSTOWCS
3543 wlen = mbstowcs(NULL, str, 0);
3544#else
3545 wlen = len;
3546#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003547 if (wlen == (size_t)-1)
3548 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003549 if (wlen+1 <= smallbuf_len) {
3550 wstr = smallbuf;
3551 }
3552 else {
3553 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3554 return PyErr_NoMemory();
3555
3556 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3557 if (!wstr)
3558 return PyErr_NoMemory();
3559 }
3560
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003561 wlen2 = mbstowcs(wstr, str, wlen+1);
3562 if (wlen2 == (size_t)-1) {
3563 if (wstr != smallbuf)
3564 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003565 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003566 }
3567#ifdef HAVE_BROKEN_MBSTOWCS
3568 assert(wlen2 == wlen);
3569#endif
3570 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3571 if (wstr != smallbuf)
3572 PyMem_Free(wstr);
3573 }
3574 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003575
3576decode_error:
3577 errmsg = strerror(errno);
3578 assert(errmsg != NULL);
3579
3580 error_pos = mbstowcs_errorpos(str, len);
3581 if (errmsg != NULL) {
3582 size_t errlen;
3583 wstr = _Py_char2wchar(errmsg, &errlen);
3584 if (wstr != NULL) {
3585 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003586 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003587 } else
3588 errmsg = NULL;
3589 }
3590 if (errmsg == NULL)
3591 reason = PyUnicode_FromString(
3592 "mbstowcs() encountered an invalid multibyte sequence");
3593 if (reason == NULL)
3594 return NULL;
3595
3596 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3597 "locale", str, len,
3598 (Py_ssize_t)error_pos,
3599 (Py_ssize_t)(error_pos+1),
3600 reason);
3601 Py_DECREF(reason);
3602 if (exc != NULL) {
3603 PyCodec_StrictErrors(exc);
3604 Py_XDECREF(exc);
3605 }
3606 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003607}
3608
3609PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003610PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003611{
3612 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003613 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003614}
3615
3616
3617PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003618PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003619 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003620 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3621}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003622
Christian Heimes5894ba72007-11-04 11:43:14 +00003623PyObject*
3624PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3625{
Victor Stinner99b95382011-07-04 14:23:54 +02003626#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003627 return PyUnicode_DecodeMBCS(s, size, NULL);
3628#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003629 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003630#else
Victor Stinner793b5312011-04-27 00:24:21 +02003631 PyInterpreterState *interp = PyThreadState_GET()->interp;
3632 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3633 cannot use it to encode and decode filenames before it is loaded. Load
3634 the Python codec requires to encode at least its own filename. Use the C
3635 version of the locale codec until the codec registry is initialized and
3636 the Python codec is loaded.
3637
3638 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3639 cannot only rely on it: check also interp->fscodec_initialized for
3640 subinterpreters. */
3641 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003642 return PyUnicode_Decode(s, size,
3643 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003644 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003645 }
3646 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003647 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003648 }
Victor Stinnerad158722010-10-27 00:25:46 +00003649#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003650}
3651
Martin v. Löwis011e8422009-05-05 04:43:17 +00003652
3653int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003654_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003655{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003656 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003657
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003658 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003659 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003660 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3661 PyUnicode_GET_LENGTH(str), '\0', 1);
3662 if (pos == -1)
3663 return 0;
3664 else
3665 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003666}
3667
Antoine Pitrou13348842012-01-29 18:36:34 +01003668int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003669PyUnicode_FSConverter(PyObject* arg, void* addr)
3670{
3671 PyObject *output = NULL;
3672 Py_ssize_t size;
3673 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003674 if (arg == NULL) {
3675 Py_DECREF(*(PyObject**)addr);
3676 return 1;
3677 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003678 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003679 output = arg;
3680 Py_INCREF(output);
3681 }
3682 else {
3683 arg = PyUnicode_FromObject(arg);
3684 if (!arg)
3685 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003686 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003687 Py_DECREF(arg);
3688 if (!output)
3689 return 0;
3690 if (!PyBytes_Check(output)) {
3691 Py_DECREF(output);
3692 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3693 return 0;
3694 }
3695 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003696 size = PyBytes_GET_SIZE(output);
3697 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003698 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003699 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003700 Py_DECREF(output);
3701 return 0;
3702 }
3703 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003704 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003705}
3706
3707
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003708int
3709PyUnicode_FSDecoder(PyObject* arg, void* addr)
3710{
3711 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003712 if (arg == NULL) {
3713 Py_DECREF(*(PyObject**)addr);
3714 return 1;
3715 }
3716 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003717 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003718 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003719 output = arg;
3720 Py_INCREF(output);
3721 }
3722 else {
3723 arg = PyBytes_FromObject(arg);
3724 if (!arg)
3725 return 0;
3726 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3727 PyBytes_GET_SIZE(arg));
3728 Py_DECREF(arg);
3729 if (!output)
3730 return 0;
3731 if (!PyUnicode_Check(output)) {
3732 Py_DECREF(output);
3733 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3734 return 0;
3735 }
3736 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003737 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003738 Py_DECREF(output);
3739 return 0;
3740 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003741 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003742 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003743 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3744 Py_DECREF(output);
3745 return 0;
3746 }
3747 *(PyObject**)addr = output;
3748 return Py_CLEANUP_SUPPORTED;
3749}
3750
3751
Martin v. Löwis5b222132007-06-10 09:51:05 +00003752char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003753PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003754{
Christian Heimesf3863112007-11-22 07:46:41 +00003755 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003756
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003757 if (!PyUnicode_Check(unicode)) {
3758 PyErr_BadArgument();
3759 return NULL;
3760 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003761 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003762 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003763
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003764 if (PyUnicode_UTF8(unicode) == NULL) {
3765 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003766 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3767 if (bytes == NULL)
3768 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003769 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3770 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003771 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003772 Py_DECREF(bytes);
3773 return NULL;
3774 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003775 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3776 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3777 PyBytes_AS_STRING(bytes),
3778 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003779 Py_DECREF(bytes);
3780 }
3781
3782 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003783 *psize = PyUnicode_UTF8_LENGTH(unicode);
3784 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003785}
3786
3787char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003788PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003789{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003790 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3791}
3792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003793Py_UNICODE *
3794PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3795{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003796 const unsigned char *one_byte;
3797#if SIZEOF_WCHAR_T == 4
3798 const Py_UCS2 *two_bytes;
3799#else
3800 const Py_UCS4 *four_bytes;
3801 const Py_UCS4 *ucs4_end;
3802 Py_ssize_t num_surrogates;
3803#endif
3804 wchar_t *w;
3805 wchar_t *wchar_end;
3806
3807 if (!PyUnicode_Check(unicode)) {
3808 PyErr_BadArgument();
3809 return NULL;
3810 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003811 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003812 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003813 assert(_PyUnicode_KIND(unicode) != 0);
3814 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003815
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003816 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003817#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003818 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3819 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003820 num_surrogates = 0;
3821
3822 for (; four_bytes < ucs4_end; ++four_bytes) {
3823 if (*four_bytes > 0xFFFF)
3824 ++num_surrogates;
3825 }
3826
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003827 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3828 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3829 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003830 PyErr_NoMemory();
3831 return NULL;
3832 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003833 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003834
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003835 w = _PyUnicode_WSTR(unicode);
3836 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3837 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003838 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3839 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003840 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003841 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003842 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3843 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003844 }
3845 else
3846 *w = *four_bytes;
3847
3848 if (w > wchar_end) {
3849 assert(0 && "Miscalculated string end");
3850 }
3851 }
3852 *w = 0;
3853#else
3854 /* sizeof(wchar_t) == 4 */
3855 Py_FatalError("Impossible unicode object state, wstr and str "
3856 "should share memory already.");
3857 return NULL;
3858#endif
3859 }
3860 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003861 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3862 (_PyUnicode_LENGTH(unicode) + 1));
3863 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003864 PyErr_NoMemory();
3865 return NULL;
3866 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003867 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3868 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3869 w = _PyUnicode_WSTR(unicode);
3870 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003871
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003872 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3873 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003874 for (; w < wchar_end; ++one_byte, ++w)
3875 *w = *one_byte;
3876 /* null-terminate the wstr */
3877 *w = 0;
3878 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003879 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003880#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003881 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003882 for (; w < wchar_end; ++two_bytes, ++w)
3883 *w = *two_bytes;
3884 /* null-terminate the wstr */
3885 *w = 0;
3886#else
3887 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003888 PyObject_FREE(_PyUnicode_WSTR(unicode));
3889 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003890 Py_FatalError("Impossible unicode object state, wstr "
3891 "and str should share memory already.");
3892 return NULL;
3893#endif
3894 }
3895 else {
3896 assert(0 && "This should never happen.");
3897 }
3898 }
3899 }
3900 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003901 *size = PyUnicode_WSTR_LENGTH(unicode);
3902 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003903}
3904
Alexander Belopolsky40018472011-02-26 01:02:56 +00003905Py_UNICODE *
3906PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003907{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003908 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003909}
3910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003911
Alexander Belopolsky40018472011-02-26 01:02:56 +00003912Py_ssize_t
3913PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003914{
3915 if (!PyUnicode_Check(unicode)) {
3916 PyErr_BadArgument();
3917 goto onError;
3918 }
3919 return PyUnicode_GET_SIZE(unicode);
3920
Benjamin Peterson29060642009-01-31 22:14:21 +00003921 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003922 return -1;
3923}
3924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003925Py_ssize_t
3926PyUnicode_GetLength(PyObject *unicode)
3927{
Victor Stinner07621332012-06-16 04:53:46 +02003928 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003929 PyErr_BadArgument();
3930 return -1;
3931 }
Victor Stinner07621332012-06-16 04:53:46 +02003932 if (PyUnicode_READY(unicode) == -1)
3933 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003934 return PyUnicode_GET_LENGTH(unicode);
3935}
3936
3937Py_UCS4
3938PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3939{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003940 void *data;
3941 int kind;
3942
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003943 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3944 PyErr_BadArgument();
3945 return (Py_UCS4)-1;
3946 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003947 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003948 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003949 return (Py_UCS4)-1;
3950 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003951 data = PyUnicode_DATA(unicode);
3952 kind = PyUnicode_KIND(unicode);
3953 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003954}
3955
3956int
3957PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3958{
3959 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003960 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003961 return -1;
3962 }
Victor Stinner488fa492011-12-12 00:01:39 +01003963 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003964 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003965 PyErr_SetString(PyExc_IndexError, "string index out of range");
3966 return -1;
3967 }
Victor Stinner488fa492011-12-12 00:01:39 +01003968 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003969 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003970 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3971 PyErr_SetString(PyExc_ValueError, "character out of range");
3972 return -1;
3973 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003974 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3975 index, ch);
3976 return 0;
3977}
3978
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3984
Victor Stinner554f3f02010-06-16 23:33:54 +00003985/* create or adjust a UnicodeDecodeError */
3986static void
3987make_decode_exception(PyObject **exceptionObject,
3988 const char *encoding,
3989 const char *input, Py_ssize_t length,
3990 Py_ssize_t startpos, Py_ssize_t endpos,
3991 const char *reason)
3992{
3993 if (*exceptionObject == NULL) {
3994 *exceptionObject = PyUnicodeDecodeError_Create(
3995 encoding, input, length, startpos, endpos, reason);
3996 }
3997 else {
3998 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3999 goto onError;
4000 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4001 goto onError;
4002 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4003 goto onError;
4004 }
4005 return;
4006
4007onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004008 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004009}
4010
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   The handler named by errors is looked up on first use and stored in
   *errorHandler.  The exception passed to it is created or updated in
   *exceptionObject.  After the call, *input and *inend are reloaded from
   the exception's object because the callback may have replaced it, and
   *endinpos / *inptr are moved to the position the handler returned.
   *output is a wchar-kind string that is resized if needed.

   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" format prefix, leaving the message */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* do not fall through and read a non-bytes object as bytes */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* a negative position counts from the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* grow geometrically when doubling is enough and cannot overflow */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen wide characters.  wcsncpy() would stop at an
       embedded NUL in the replacement and zero-fill the remainder. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4126
4127static int
4128unicode_decode_call_errorhandler_writer(
4129 const char *errors, PyObject **errorHandler,
4130 const char *encoding, const char *reason,
4131 const char **input, const char **inend, Py_ssize_t *startinpos,
4132 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4133 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4134{
4135 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4136
4137 PyObject *restuple = NULL;
4138 PyObject *repunicode = NULL;
4139 Py_ssize_t insize;
4140 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004141 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004142 PyObject *inputobj = NULL;
4143
4144 if (*errorHandler == NULL) {
4145 *errorHandler = PyCodec_LookupError(errors);
4146 if (*errorHandler == NULL)
4147 goto onError;
4148 }
4149
4150 make_decode_exception(exceptionObject,
4151 encoding,
4152 *input, *inend - *input,
4153 *startinpos, *endinpos,
4154 reason);
4155 if (*exceptionObject == NULL)
4156 goto onError;
4157
4158 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4159 if (restuple == NULL)
4160 goto onError;
4161 if (!PyTuple_Check(restuple)) {
4162 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4163 goto onError;
4164 }
4165 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004166 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004167
4168 /* Copy back the bytes variables, which might have been modified by the
4169 callback */
4170 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4171 if (!inputobj)
4172 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004173 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004175 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004176 *input = PyBytes_AS_STRING(inputobj);
4177 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004178 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004179 /* we can DECREF safely, as the exception has another reference,
4180 so the object won't go away. */
4181 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004182
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004183 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004184 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004185 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004186 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004187 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004188 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189
Victor Stinner8f674cc2013-04-17 23:02:17 +02004190 if (PyUnicode_READY(repunicode) < 0)
4191 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004192 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004193 if (replen > 1) {
4194 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004195 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004196 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4197 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4198 goto onError;
4199 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004200 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004201 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004202
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004203 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004204 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004205
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004207 Py_XDECREF(restuple);
4208 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004209
Benjamin Peterson29060642009-01-31 22:14:21 +00004210 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004211 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004212 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213}
4214
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004215/* --- UTF-7 Codec -------------------------------------------------------- */
4216
Antoine Pitrou244651a2009-05-04 18:56:13 +00004217/* See RFC2152 for details. We encode conservatively and decode liberally. */
4218
4219/* Three simple macros defining base-64. */
4220
4221/* Is c a base-64 character? */
4222
4223#define IS_BASE64(c) \
4224 (((c) >= 'A' && (c) <= 'Z') || \
4225 ((c) >= 'a' && (c) <= 'z') || \
4226 ((c) >= '0' && (c) <= '9') || \
4227 (c) == '+' || (c) == '/')
4228
4229/* given that c is a base-64 character, what is its base-64 value? */
4230
4231#define FROM_BASE64(c) \
4232 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
4233 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
4234 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
4235 (c) == '+' ? 62 : 63)
4236
4237/* What is the base-64 character of the bottom 6 bits of n? */
4238
4239#define TO_BASE64(n) \
4240 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4241
4242/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
4243 * decoded as itself. We are permissive on decoding; the only ASCII
4244 * byte not decoding to itself is the + which begins a base64
4245 * string. */
4246
4247#define DECODE_DIRECT(c) \
4248 ((c) <= 127 && (c) != '+')
4249
4250/* The UTF-7 encoder treats ASCII characters differently according to
4251 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4252 * the above). See RFC2152. This array identifies these different
4253 * sets:
4254 * 0 : "Set D"
4255 * alphanumeric and '(),-./:?
4256 * 1 : "Set O"
4257 * !"#$%&*;<=>@[]^_`{|}
4258 * 2 : "whitespace"
4259 * ht nl cr sp
4260 * 3 : special (must be base64 encoded)
4261 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4262 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004263
Tim Petersced69f82003-09-16 20:30:58 +00004264static
Antoine Pitrou244651a2009-05-04 18:56:13 +00004265char utf7_category[128] = {
4266/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
4267 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
4268/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
4269 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4270/* sp ! " # $ % & ' ( ) * + , - . / */
4271 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
4272/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
4273 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
4274/* @ A B C D E F G H I J K L M N O */
4275 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4276/* P Q R S T U V W X Y Z [ \ ] ^ _ */
4277 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
4278/* ` a b c d e f g h i j k l m n o */
4279 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4280/* p q r s t u v w x y z { | } ~ del */
4281 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004282};
4283
Antoine Pitrou244651a2009-05-04 18:56:13 +00004284/* ENCODE_DIRECT: this character should be encoded as itself. The
4285 * answer depends on whether we are encoding set O as itself, and also
4286 * on whether we are encoding whitespace as itself. RFC2152 makes it
4287 * clear that the answers to these questions vary between
4288 * applications, so this code needs to be flexible. */
Marc-André Lemburge115ec82005-10-19 22:33:31 +00004289
/* ENCODE_DIRECT(c, directO, directWS): non-zero when code point c may be
 * emitted as itself.  c must be non-zero ASCII and either belong to
 * Set D, or be whitespace while directWS is true, or belong to Set O
 * while directO is true.  The flag arguments are parenthesized so that
 * any expression can be passed; c is evaluated several times. */
#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004295
Alexander Belopolsky40018472011-02-26 01:02:56 +00004296PyObject *
4297PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004298 Py_ssize_t size,
4299 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004300{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004301 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4302}
4303
Antoine Pitrou244651a2009-05-04 18:56:13 +00004304/* The decoder. The only state we preserve is our read position,
4305 * i.e. how many characters we have consumed. So if we end in the
4306 * middle of a shift sequence we have to back off the read position
4307 * and the output to the beginning of the sequence, otherwise we lose
4308 * all the shift state (seen bits, number of bits seen, high
4309 * surrogate). */
4310
Alexander Belopolsky40018472011-02-26 01:02:56 +00004311PyObject *
4312PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004313 Py_ssize_t size,
4314 const char *errors,
4315 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004316{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004317 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004318 Py_ssize_t startinpos;
4319 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004320 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004321 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004322 const char *errmsg = "";
4323 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004324 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004325 unsigned int base64bits = 0;
4326 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004327 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004328 PyObject *errorHandler = NULL;
4329 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004330
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004331 if (size == 0) {
4332 if (consumed)
4333 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004334 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004335 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004336
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004337 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004338 _PyUnicodeWriter_Init(&writer);
4339 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004340
4341 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004342 e = s + size;
4343
4344 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004345 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004346 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004347 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004348
Antoine Pitrou244651a2009-05-04 18:56:13 +00004349 if (inShift) { /* in a base-64 section */
4350 if (IS_BASE64(ch)) { /* consume a base-64 character */
4351 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4352 base64bits += 6;
4353 s++;
4354 if (base64bits >= 16) {
4355 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004356 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004357 base64bits -= 16;
4358 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004359 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004360 if (surrogate) {
4361 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004362 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4363 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004364 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004365 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004366 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004367 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004368 }
4369 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004370 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004371 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004372 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004373 }
4374 }
Victor Stinner551ac952011-11-29 22:58:13 +01004375 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004376 /* first surrogate */
4377 surrogate = outCh;
4378 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004379 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004380 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004381 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004382 }
4383 }
4384 }
4385 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004386 inShift = 0;
4387 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004388 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004389 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004390 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004391 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004392 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004393 if (base64bits > 0) { /* left-over bits */
4394 if (base64bits >= 6) {
4395 /* We've seen at least one base-64 character */
4396 errmsg = "partial character in shift sequence";
4397 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004398 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004399 else {
4400 /* Some bits remain; they should be zero */
4401 if (base64buffer != 0) {
4402 errmsg = "non-zero padding bits in shift sequence";
4403 goto utf7Error;
4404 }
4405 }
4406 }
4407 if (ch != '-') {
4408 /* '-' is absorbed; other terminating
4409 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004410 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004411 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004412 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004413 }
4414 }
4415 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004416 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004417 s++; /* consume '+' */
4418 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004419 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004420 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004421 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004422 }
4423 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004424 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004425 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004426 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004427 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004428 }
4429 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004430 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004431 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004432 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004433 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004434 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004435 else {
4436 startinpos = s-starts;
4437 s++;
4438 errmsg = "unexpected special character";
4439 goto utf7Error;
4440 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004441 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004442utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004443 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004444 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004445 errors, &errorHandler,
4446 "utf7", errmsg,
4447 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004448 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004449 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004450 }
4451
Antoine Pitrou244651a2009-05-04 18:56:13 +00004452 /* end of string */
4453
4454 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4455 /* if we're in an inconsistent state, that's an error */
4456 if (surrogate ||
4457 (base64bits >= 6) ||
4458 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004459 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004460 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004461 errors, &errorHandler,
4462 "utf7", "unterminated shift sequence",
4463 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004464 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004465 goto onError;
4466 if (s < e)
4467 goto restart;
4468 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004469 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004470
4471 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004472 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004473 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004474 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004475 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004476 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004477 writer.kind, writer.data, shiftOutStart);
4478 Py_XDECREF(errorHandler);
4479 Py_XDECREF(exc);
4480 _PyUnicodeWriter_Dealloc(&writer);
4481 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004482 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004483 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004484 }
4485 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004486 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004487 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004488 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004489
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004490 Py_XDECREF(errorHandler);
4491 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004492 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004493
Benjamin Peterson29060642009-01-31 22:14:21 +00004494 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495 Py_XDECREF(errorHandler);
4496 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004497 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004498 return NULL;
4499}
4500
4501
Alexander Belopolsky40018472011-02-26 01:02:56 +00004502PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004503_PyUnicode_EncodeUTF7(PyObject *str,
4504 int base64SetO,
4505 int base64WhiteSpace,
4506 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004507{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004508 int kind;
4509 void *data;
4510 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004511 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004512 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004513 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004514 unsigned int base64bits = 0;
4515 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004516 char * out;
4517 char * start;
4518
Benjamin Petersonbac79492012-01-14 13:34:47 -05004519 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004520 return NULL;
4521 kind = PyUnicode_KIND(str);
4522 data = PyUnicode_DATA(str);
4523 len = PyUnicode_GET_LENGTH(str);
4524
4525 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004526 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004527
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004528 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004529 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004530 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004531 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004532 if (v == NULL)
4533 return NULL;
4534
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004535 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004536 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004537 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004538
Antoine Pitrou244651a2009-05-04 18:56:13 +00004539 if (inShift) {
4540 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4541 /* shifting out */
4542 if (base64bits) { /* output remaining bits */
4543 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4544 base64buffer = 0;
4545 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004546 }
4547 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004548 /* Characters not in the BASE64 set implicitly unshift the sequence
4549 so no '-' is required, except if the character is itself a '-' */
4550 if (IS_BASE64(ch) || ch == '-') {
4551 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004552 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004553 *out++ = (char) ch;
4554 }
4555 else {
4556 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004557 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004558 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004559 else { /* not in a shift sequence */
4560 if (ch == '+') {
4561 *out++ = '+';
4562 *out++ = '-';
4563 }
4564 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4565 *out++ = (char) ch;
4566 }
4567 else {
4568 *out++ = '+';
4569 inShift = 1;
4570 goto encode_char;
4571 }
4572 }
4573 continue;
4574encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004575 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004576 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004577
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 /* code first surrogate */
4579 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004580 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 while (base64bits >= 6) {
4582 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4583 base64bits -= 6;
4584 }
4585 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004586 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004587 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004588 base64bits += 16;
4589 base64buffer = (base64buffer << 16) | ch;
4590 while (base64bits >= 6) {
4591 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4592 base64bits -= 6;
4593 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004594 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004595 if (base64bits)
4596 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4597 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004598 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004599 if (_PyBytes_Resize(&v, out - start) < 0)
4600 return NULL;
4601 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004602}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004603PyObject *
4604PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4605 Py_ssize_t size,
4606 int base64SetO,
4607 int base64WhiteSpace,
4608 const char *errors)
4609{
4610 PyObject *result;
4611 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4612 if (tmp == NULL)
4613 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004614 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004615 base64WhiteSpace, errors);
4616 Py_DECREF(tmp);
4617 return result;
4618}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004619
Antoine Pitrou244651a2009-05-04 18:56:13 +00004620#undef IS_BASE64
4621#undef FROM_BASE64
4622#undef TO_BASE64
4623#undef DECODE_DIRECT
4624#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004625
Guido van Rossumd57fd912000-03-10 22:53:23 +00004626/* --- UTF-8 Codec -------------------------------------------------------- */
4627
Alexander Belopolsky40018472011-02-26 01:02:56 +00004628PyObject *
4629PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004630 Py_ssize_t size,
4631 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004632{
Walter Dörwald69652032004-09-07 20:24:22 +00004633 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4634}
4635
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004636#include "stringlib/asciilib.h"
4637#include "stringlib/codecs.h"
4638#include "stringlib/undef.h"
4639
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004640#include "stringlib/ucs1lib.h"
4641#include "stringlib/codecs.h"
4642#include "stringlib/undef.h"
4643
4644#include "stringlib/ucs2lib.h"
4645#include "stringlib/codecs.h"
4646#include "stringlib/undef.h"
4647
4648#include "stringlib/ucs4lib.h"
4649#include "stringlib/codecs.h"
4650#include "stringlib/undef.h"
4651
/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char.  The mask has bit 7 of every byte set,
   so (word & ASCII_CHAR_MASK) is non-zero exactly when at least one
   byte of the word is >= 0x80.  Only 4- and 8-byte longs are
   supported; anything else is a compile-time error. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif
4661
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004662static Py_ssize_t
4663ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004664{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004665 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004666 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004667
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004668 /*
4669 * Issue #17237: m68k is a bit different from most architectures in
4670 * that objects do not use "natural alignment" - for example, int and
4671 * long are only aligned at 2-byte boundaries. Therefore the assert()
4672 * won't work; also, tests have shown that skipping the "optimised
4673 * version" will even speed up m68k.
4674 */
4675#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004676#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004677 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4678 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004679 /* Fast path, see in STRINGLIB(utf8_decode) for
4680 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004681 /* Help allocation */
4682 const char *_p = p;
4683 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004684 while (_p < aligned_end) {
4685 unsigned long value = *(const unsigned long *) _p;
4686 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004687 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004688 *((unsigned long *)q) = value;
4689 _p += SIZEOF_LONG;
4690 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004691 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004692 p = _p;
4693 while (p < end) {
4694 if ((unsigned char)*p & 0x80)
4695 break;
4696 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004698 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004699 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004700#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004701#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004702 while (p < end) {
4703 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4704 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004705 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004706 /* Help allocation */
4707 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004708 while (_p < aligned_end) {
4709 unsigned long value = *(unsigned long *) _p;
4710 if (value & ASCII_CHAR_MASK)
4711 break;
4712 _p += SIZEOF_LONG;
4713 }
4714 p = _p;
4715 if (_p == end)
4716 break;
4717 }
4718 if ((unsigned char)*p & 0x80)
4719 break;
4720 ++p;
4721 }
4722 memcpy(dest, start, p - start);
4723 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724}
Antoine Pitrouab868312009-01-10 15:40:25 +00004725
Victor Stinner785938e2011-12-11 20:09:03 +01004726PyObject *
4727PyUnicode_DecodeUTF8Stateful(const char *s,
4728 Py_ssize_t size,
4729 const char *errors,
4730 Py_ssize_t *consumed)
4731{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004732 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004733 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004734 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004735
4736 Py_ssize_t startinpos;
4737 Py_ssize_t endinpos;
4738 const char *errmsg = "";
4739 PyObject *errorHandler = NULL;
4740 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004741
4742 if (size == 0) {
4743 if (consumed)
4744 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004745 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004746 }
4747
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004748 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4749 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004750 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004751 *consumed = 1;
4752 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004753 }
4754
Victor Stinner8f674cc2013-04-17 23:02:17 +02004755 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004756 writer.min_length = size;
4757 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004758 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004759
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004760 writer.pos = ascii_decode(s, end, writer.data);
4761 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004762 while (s < end) {
4763 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004764 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004765 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004766 if (PyUnicode_IS_ASCII(writer.buffer))
4767 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004768 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004769 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004770 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004771 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004772 } else {
4773 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004774 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004775 }
4776
4777 switch (ch) {
4778 case 0:
4779 if (s == end || consumed)
4780 goto End;
4781 errmsg = "unexpected end of data";
4782 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004783 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004784 break;
4785 case 1:
4786 errmsg = "invalid start byte";
4787 startinpos = s - starts;
4788 endinpos = startinpos + 1;
4789 break;
4790 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004791 case 3:
4792 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004793 errmsg = "invalid continuation byte";
4794 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004795 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004796 break;
4797 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004798 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004799 goto onError;
4800 continue;
4801 }
4802
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004803 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004804 errors, &errorHandler,
4805 "utf-8", errmsg,
4806 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004807 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004808 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004809 }
4810
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004811End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004812 if (consumed)
4813 *consumed = s - starts;
4814
4815 Py_XDECREF(errorHandler);
4816 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004817 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004818
4819onError:
4820 Py_XDECREF(errorHandler);
4821 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004822 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004823 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004824}
4825
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004826#ifdef __APPLE__
4827
4828/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004829 used to decode the command line arguments on Mac OS X.
4830
4831 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004832 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004833
4834wchar_t*
4835_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4836{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004837 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004838 wchar_t *unicode;
4839 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004840
4841 /* Note: size will always be longer than the resulting Unicode
4842 character count */
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004843 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004844 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004845 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004846 if (!unicode)
4847 return NULL;
4848
4849 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004850 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004851 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004852 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004853 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004854#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004855 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004856#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004857 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004858#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004859 if (ch > 0xFF) {
4860#if SIZEOF_WCHAR_T == 4
4861 assert(0);
4862#else
4863 assert(Py_UNICODE_IS_SURROGATE(ch));
4864 /* compute and append the two surrogates: */
4865 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4866 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4867#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004868 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004869 else {
4870 if (!ch && s == e)
4871 break;
4872 /* surrogateescape */
4873 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4874 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004875 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004876 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004877 return unicode;
4878}
4879
4880#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004881
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004882/* Primary internal function which creates utf8 encoded bytes objects.
4883
4884 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004885 and allocate exactly as much space needed at the end. Else allocate the
4886 maximum possible needed (4 result bytes per Unicode character), and return
4887 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004888*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004889PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004890_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891{
Victor Stinner6099a032011-12-18 14:22:26 +01004892 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004893 void *data;
4894 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004895
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004896 if (!PyUnicode_Check(unicode)) {
4897 PyErr_BadArgument();
4898 return NULL;
4899 }
4900
4901 if (PyUnicode_READY(unicode) == -1)
4902 return NULL;
4903
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004904 if (PyUnicode_UTF8(unicode))
4905 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4906 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004907
4908 kind = PyUnicode_KIND(unicode);
4909 data = PyUnicode_DATA(unicode);
4910 size = PyUnicode_GET_LENGTH(unicode);
4911
Benjamin Petersonead6b532011-12-20 17:23:42 -06004912 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004913 default:
4914 assert(0);
4915 case PyUnicode_1BYTE_KIND:
4916 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4917 assert(!PyUnicode_IS_ASCII(unicode));
4918 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4919 case PyUnicode_2BYTE_KIND:
4920 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4921 case PyUnicode_4BYTE_KIND:
4922 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004923 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924}
4925
Alexander Belopolsky40018472011-02-26 01:02:56 +00004926PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004927PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4928 Py_ssize_t size,
4929 const char *errors)
4930{
4931 PyObject *v, *unicode;
4932
4933 unicode = PyUnicode_FromUnicode(s, size);
4934 if (unicode == NULL)
4935 return NULL;
4936 v = _PyUnicode_AsUTF8String(unicode, errors);
4937 Py_DECREF(unicode);
4938 return v;
4939}
4940
4941PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004942PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004943{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004944 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945}
4946
Walter Dörwald41980ca2007-08-16 21:55:45 +00004947/* --- UTF-32 Codec ------------------------------------------------------- */
4948
4949PyObject *
4950PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004951 Py_ssize_t size,
4952 const char *errors,
4953 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004954{
4955 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4956}
4957
4958PyObject *
4959PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004960 Py_ssize_t size,
4961 const char *errors,
4962 int *byteorder,
4963 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004964{
4965 const char *starts = s;
4966 Py_ssize_t startinpos;
4967 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004968 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004969 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004970 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004971 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004972 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004973 PyObject *errorHandler = NULL;
4974 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004975
Walter Dörwald41980ca2007-08-16 21:55:45 +00004976 q = (unsigned char *)s;
4977 e = q + size;
4978
4979 if (byteorder)
4980 bo = *byteorder;
4981
4982 /* Check for BOM marks (U+FEFF) in the input and adjust current
4983 byte order setting accordingly. In native mode, the leading BOM
4984 mark is skipped, in all other modes, it is copied to the output
4985 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004986 if (bo == 0 && size >= 4) {
4987 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4988 if (bom == 0x0000FEFF) {
4989 bo = -1;
4990 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004991 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004992 else if (bom == 0xFFFE0000) {
4993 bo = 1;
4994 q += 4;
4995 }
4996 if (byteorder)
4997 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004998 }
4999
Victor Stinnere64322e2012-10-30 23:12:47 +01005000 if (q == e) {
5001 if (consumed)
5002 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005003 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005004 }
5005
Victor Stinnere64322e2012-10-30 23:12:47 +01005006#ifdef WORDS_BIGENDIAN
5007 le = bo < 0;
5008#else
5009 le = bo <= 0;
5010#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005011 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005012
Victor Stinner8f674cc2013-04-17 23:02:17 +02005013 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005014 writer.min_length = (e - q + 3) / 4;
5015 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005016 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005017
Victor Stinnere64322e2012-10-30 23:12:47 +01005018 while (1) {
5019 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005020 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005021
Victor Stinnere64322e2012-10-30 23:12:47 +01005022 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005023 enum PyUnicode_Kind kind = writer.kind;
5024 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005025 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005026 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005027 if (le) {
5028 do {
5029 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5030 if (ch > maxch)
5031 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005032 if (kind != PyUnicode_1BYTE_KIND &&
5033 Py_UNICODE_IS_SURROGATE(ch))
5034 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005035 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005036 q += 4;
5037 } while (q <= last);
5038 }
5039 else {
5040 do {
5041 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5042 if (ch > maxch)
5043 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005044 if (kind != PyUnicode_1BYTE_KIND &&
5045 Py_UNICODE_IS_SURROGATE(ch))
5046 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005047 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005048 q += 4;
5049 } while (q <= last);
5050 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005051 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005052 }
5053
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005054 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005055 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005056 startinpos = ((const char *)q) - starts;
5057 endinpos = startinpos + 4;
5058 }
5059 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005060 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005061 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005062 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005064 startinpos = ((const char *)q) - starts;
5065 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005067 else {
5068 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005069 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005070 goto onError;
5071 q += 4;
5072 continue;
5073 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005074 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005075 startinpos = ((const char *)q) - starts;
5076 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005077 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005078
5079 /* The remaining input chars are ignored if the callback
5080 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005081 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005082 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005083 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005084 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005085 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005086 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005087 }
5088
Walter Dörwald41980ca2007-08-16 21:55:45 +00005089 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005090 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005091
Walter Dörwald41980ca2007-08-16 21:55:45 +00005092 Py_XDECREF(errorHandler);
5093 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005094 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005095
Benjamin Peterson29060642009-01-31 22:14:21 +00005096 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005097 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005098 Py_XDECREF(errorHandler);
5099 Py_XDECREF(exc);
5100 return NULL;
5101}
5102
5103PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005104_PyUnicode_EncodeUTF32(PyObject *str,
5105 const char *errors,
5106 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005107{
Serhiy Storchaka30793282014-01-04 22:44:01 +02005108 int kind;
5109 void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005110 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005111 PyObject *v;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005112 unsigned char *p;
5113 Py_ssize_t nsize, i;
5114 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005115#if PY_LITTLE_ENDIAN
Serhiy Storchaka30793282014-01-04 22:44:01 +02005116 int iorder[] = {0, 1, 2, 3};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005117#else
Serhiy Storchaka30793282014-01-04 22:44:01 +02005118 int iorder[] = {3, 2, 1, 0};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005119#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005120 const char *encoding;
5121 PyObject *errorHandler = NULL;
5122 PyObject *exc = NULL;
5123 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005124
Serhiy Storchaka30793282014-01-04 22:44:01 +02005125#define STORECHAR(CH) \
5126 do { \
5127 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5128 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5129 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5130 p[iorder[0]] = (CH) & 0xff; \
5131 p += 4; \
5132 } while(0)
5133
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005134 if (!PyUnicode_Check(str)) {
5135 PyErr_BadArgument();
5136 return NULL;
5137 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005138 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005139 return NULL;
5140 kind = PyUnicode_KIND(str);
5141 data = PyUnicode_DATA(str);
5142 len = PyUnicode_GET_LENGTH(str);
5143
Serhiy Storchaka583a9392014-01-04 19:25:37 +02005144 nsize = len + (byteorder == 0);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005145 if (nsize > PY_SSIZE_T_MAX / 4)
5146 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005147 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005148 if (v == NULL)
5149 return NULL;
5150
Serhiy Storchaka30793282014-01-04 22:44:01 +02005151 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005152 if (byteorder == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005153 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005154 if (len == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005155 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005156
Serhiy Storchaka30793282014-01-04 22:44:01 +02005157 if (byteorder == -1) {
5158 /* force LE */
5159 iorder[0] = 0;
5160 iorder[1] = 1;
5161 iorder[2] = 2;
5162 iorder[3] = 3;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005163 encoding = "utf-32-le";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005164 }
5165 else if (byteorder == 1) {
5166 /* force BE */
5167 iorder[0] = 3;
5168 iorder[1] = 2;
5169 iorder[2] = 1;
5170 iorder[3] = 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005171 encoding = "utf-32-be";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005172 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005173 else
5174 encoding = "utf-32";
5175
5176 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005177 for (i = 0; i < len; i++)
5178 STORECHAR(PyUnicode_READ(kind, data, i));
5179 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005180 }
5181
Serhiy Storchaka30793282014-01-04 22:44:01 +02005182 for (i = 0; i < len;) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005183 Py_ssize_t repsize, moreunits;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005184 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5185 i++;
5186 assert(ch <= MAX_UNICODE);
5187 if (!Py_UNICODE_IS_SURROGATE(ch)) {
5188 STORECHAR(ch);
5189 continue;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005190 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005191
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005192 rep = unicode_encode_call_errorhandler(
5193 errors, &errorHandler,
5194 encoding, "surrogates not allowed",
Serhiy Storchaka30793282014-01-04 22:44:01 +02005195 str, &exc, i-1, i, &i);
5196
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005197 if (!rep)
5198 goto error;
5199
5200 if (PyBytes_Check(rep)) {
5201 repsize = PyBytes_GET_SIZE(rep);
5202 if (repsize & 3) {
5203 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005204 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005205 "surrogates not allowed");
5206 goto error;
5207 }
5208 moreunits = repsize / 4;
5209 }
5210 else {
5211 assert(PyUnicode_Check(rep));
5212 if (PyUnicode_READY(rep) < 0)
5213 goto error;
5214 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5215 if (!PyUnicode_IS_ASCII(rep)) {
5216 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005217 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005218 "surrogates not allowed");
5219 goto error;
5220 }
5221 }
5222
5223 /* four bytes are reserved for each surrogate */
5224 if (moreunits > 1) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005225 Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005226 Py_ssize_t morebytes = 4 * (moreunits - 1);
5227 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5228 /* integer overflow */
5229 PyErr_NoMemory();
5230 goto error;
5231 }
5232 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5233 goto error;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005234 p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005235 }
5236
5237 if (PyBytes_Check(rep)) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005238 Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
5239 p += repsize;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005240 } else /* rep is unicode */ {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005241 const Py_UCS1 *repdata;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005242 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005243 repdata = PyUnicode_1BYTE_DATA(rep);
5244 while (repsize--) {
5245 Py_UCS4 ch = *repdata++;
5246 STORECHAR(ch);
5247 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005248 }
5249
5250 Py_CLEAR(rep);
5251 }
5252
5253 /* Cut back to size actually needed. This is necessary for, for example,
5254 encoding of a string containing isolated surrogates and the 'ignore'
5255 handler is used. */
Serhiy Storchaka30793282014-01-04 22:44:01 +02005256 nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005257 if (nsize != PyBytes_GET_SIZE(v))
5258 _PyBytes_Resize(&v, nsize);
5259 Py_XDECREF(errorHandler);
5260 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005261 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005262 error:
5263 Py_XDECREF(rep);
5264 Py_XDECREF(errorHandler);
5265 Py_XDECREF(exc);
5266 Py_XDECREF(v);
5267 return NULL;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005268#undef STORECHAR
Walter Dörwald41980ca2007-08-16 21:55:45 +00005269}
5270
Alexander Belopolsky40018472011-02-26 01:02:56 +00005271PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005272PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5273 Py_ssize_t size,
5274 const char *errors,
5275 int byteorder)
5276{
5277 PyObject *result;
5278 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5279 if (tmp == NULL)
5280 return NULL;
5281 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5282 Py_DECREF(tmp);
5283 return result;
5284}
5285
5286PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005287PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005288{
Victor Stinnerb960b342011-11-20 19:12:52 +01005289 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005290}
5291
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292/* --- UTF-16 Codec ------------------------------------------------------- */
5293
Tim Peters772747b2001-08-09 22:21:55 +00005294PyObject *
5295PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005296 Py_ssize_t size,
5297 const char *errors,
5298 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299{
Walter Dörwald69652032004-09-07 20:24:22 +00005300 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5301}
5302
5303PyObject *
5304PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005305 Py_ssize_t size,
5306 const char *errors,
5307 int *byteorder,
5308 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005309{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005310 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005311 Py_ssize_t startinpos;
5312 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005313 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005314 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005315 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005316 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005317 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005318 PyObject *errorHandler = NULL;
5319 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005320 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321
Tim Peters772747b2001-08-09 22:21:55 +00005322 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005323 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324
5325 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005326 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005328 /* Check for BOM marks (U+FEFF) in the input and adjust current
5329 byte order setting accordingly. In native mode, the leading BOM
5330 mark is skipped, in all other modes, it is copied to the output
5331 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005332 if (bo == 0 && size >= 2) {
5333 const Py_UCS4 bom = (q[1] << 8) | q[0];
5334 if (bom == 0xFEFF) {
5335 q += 2;
5336 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005337 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005338 else if (bom == 0xFFFE) {
5339 q += 2;
5340 bo = 1;
5341 }
5342 if (byteorder)
5343 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005344 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345
Antoine Pitrou63065d72012-05-15 23:48:04 +02005346 if (q == e) {
5347 if (consumed)
5348 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005349 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005350 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005351
Christian Heimes743e0cd2012-10-17 23:52:17 +02005352#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005353 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005354 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005355#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005356 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005357 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005358#endif
Tim Peters772747b2001-08-09 22:21:55 +00005359
Antoine Pitrou63065d72012-05-15 23:48:04 +02005360 /* Note: size will always be longer than the resulting Unicode
5361 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005362 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005363 writer.min_length = (e - q + 1) / 2;
5364 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005365 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005366
Antoine Pitrou63065d72012-05-15 23:48:04 +02005367 while (1) {
5368 Py_UCS4 ch = 0;
5369 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005370 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005371 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005372 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005373 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005374 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005375 native_ordering);
5376 else
5377 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005378 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005379 native_ordering);
5380 } else if (kind == PyUnicode_2BYTE_KIND) {
5381 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005382 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005383 native_ordering);
5384 } else {
5385 assert(kind == PyUnicode_4BYTE_KIND);
5386 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005387 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005388 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005389 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005390 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005391
Antoine Pitrou63065d72012-05-15 23:48:04 +02005392 switch (ch)
5393 {
5394 case 0:
5395 /* remaining byte at the end? (size should be even) */
5396 if (q == e || consumed)
5397 goto End;
5398 errmsg = "truncated data";
5399 startinpos = ((const char *)q) - starts;
5400 endinpos = ((const char *)e) - starts;
5401 break;
5402 /* The remaining input chars are ignored if the callback
5403 chooses to skip the input */
5404 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005405 q -= 2;
5406 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005407 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005408 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005409 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005410 endinpos = ((const char *)e) - starts;
5411 break;
5412 case 2:
5413 errmsg = "illegal encoding";
5414 startinpos = ((const char *)q) - 2 - starts;
5415 endinpos = startinpos + 2;
5416 break;
5417 case 3:
5418 errmsg = "illegal UTF-16 surrogate";
5419 startinpos = ((const char *)q) - 4 - starts;
5420 endinpos = startinpos + 2;
5421 break;
5422 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005423 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005424 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005425 continue;
5426 }
5427
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005428 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005429 errors,
5430 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005431 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005432 &starts,
5433 (const char **)&e,
5434 &startinpos,
5435 &endinpos,
5436 &exc,
5437 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005438 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005439 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 }
5441
Antoine Pitrou63065d72012-05-15 23:48:04 +02005442End:
Walter Dörwald69652032004-09-07 20:24:22 +00005443 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005444 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005445
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005446 Py_XDECREF(errorHandler);
5447 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005448 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449
Benjamin Peterson29060642009-01-31 22:14:21 +00005450 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005451 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005452 Py_XDECREF(errorHandler);
5453 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 return NULL;
5455}
5456
Tim Peters772747b2001-08-09 22:21:55 +00005457PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005458_PyUnicode_EncodeUTF16(PyObject *str,
5459 const char *errors,
5460 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005462 enum PyUnicode_Kind kind;
5463 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005464 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005465 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005466 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005467 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005468#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005469 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005470#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005471 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005472#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005473 const char *encoding;
5474 Py_ssize_t nsize, pos;
5475 PyObject *errorHandler = NULL;
5476 PyObject *exc = NULL;
5477 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005478
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005479 if (!PyUnicode_Check(str)) {
5480 PyErr_BadArgument();
5481 return NULL;
5482 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005483 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005484 return NULL;
5485 kind = PyUnicode_KIND(str);
5486 data = PyUnicode_DATA(str);
5487 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005488
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005489 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005490 if (kind == PyUnicode_4BYTE_KIND) {
5491 const Py_UCS4 *in = (const Py_UCS4 *)data;
5492 const Py_UCS4 *end = in + len;
5493 while (in < end)
5494 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005495 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005496 }
5497 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005498 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005499 nsize = len + pairs + (byteorder == 0);
5500 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501 if (v == NULL)
5502 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005504 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005505 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005506 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005508 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005509 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005510 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005511
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005512 if (kind == PyUnicode_1BYTE_KIND) {
5513 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5514 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005515 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005516
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005517 if (byteorder < 0)
5518 encoding = "utf-16-le";
5519 else if (byteorder > 0)
5520 encoding = "utf-16-be";
5521 else
5522 encoding = "utf-16";
5523
5524 pos = 0;
5525 while (pos < len) {
5526 Py_ssize_t repsize, moreunits;
5527
5528 if (kind == PyUnicode_2BYTE_KIND) {
5529 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5530 &out, native_ordering);
5531 }
5532 else {
5533 assert(kind == PyUnicode_4BYTE_KIND);
5534 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5535 &out, native_ordering);
5536 }
5537 if (pos == len)
5538 break;
5539
5540 rep = unicode_encode_call_errorhandler(
5541 errors, &errorHandler,
5542 encoding, "surrogates not allowed",
5543 str, &exc, pos, pos + 1, &pos);
5544 if (!rep)
5545 goto error;
5546
5547 if (PyBytes_Check(rep)) {
5548 repsize = PyBytes_GET_SIZE(rep);
5549 if (repsize & 1) {
5550 raise_encode_exception(&exc, encoding,
5551 str, pos - 1, pos,
5552 "surrogates not allowed");
5553 goto error;
5554 }
5555 moreunits = repsize / 2;
5556 }
5557 else {
5558 assert(PyUnicode_Check(rep));
5559 if (PyUnicode_READY(rep) < 0)
5560 goto error;
5561 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5562 if (!PyUnicode_IS_ASCII(rep)) {
5563 raise_encode_exception(&exc, encoding,
5564 str, pos - 1, pos,
5565 "surrogates not allowed");
5566 goto error;
5567 }
5568 }
5569
5570 /* two bytes are reserved for each surrogate */
5571 if (moreunits > 1) {
5572 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5573 Py_ssize_t morebytes = 2 * (moreunits - 1);
5574 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5575 /* integer overflow */
5576 PyErr_NoMemory();
5577 goto error;
5578 }
5579 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5580 goto error;
5581 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5582 }
5583
5584 if (PyBytes_Check(rep)) {
5585 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5586 out += moreunits;
5587 } else /* rep is unicode */ {
5588 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5589 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5590 &out, native_ordering);
5591 }
5592
5593 Py_CLEAR(rep);
5594 }
5595
5596 /* Cut back to size actually needed. This is necessary for, for example,
5597 encoding of a string containing isolated surrogates and the 'ignore' handler
5598 is used. */
5599 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5600 if (nsize != PyBytes_GET_SIZE(v))
5601 _PyBytes_Resize(&v, nsize);
5602 Py_XDECREF(errorHandler);
5603 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005604 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005605 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005606 error:
5607 Py_XDECREF(rep);
5608 Py_XDECREF(errorHandler);
5609 Py_XDECREF(exc);
5610 Py_XDECREF(v);
5611 return NULL;
5612#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613}
5614
Alexander Belopolsky40018472011-02-26 01:02:56 +00005615PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005616PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5617 Py_ssize_t size,
5618 const char *errors,
5619 int byteorder)
5620{
5621 PyObject *result;
5622 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5623 if (tmp == NULL)
5624 return NULL;
5625 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5626 Py_DECREF(tmp);
5627 return result;
5628}
5629
5630PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005631PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005633 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634}
5635
5636/* --- Unicode Escape Codec ----------------------------------------------- */
5637
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether the
   escaped string decodes to pure ASCII and, if so, how many characters the
   decoded string has.

   Returns the decoded length, or -1 when that cannot be guaranteed:
     - size is negative,
     - the input contains a byte above 127,
     - a backslash is the last byte or is followed by a byte above 127,
     - an octal, \x, \u, \U or \N escape is present (these may produce
       non-ASCII characters).

   Backslash-newline contributes no character, the simple escapes
   (\\ \' \" \b \f \t \n \r \v \a) contribute one, and an unrecognised
   escape is counted as two characters (the backslash and the character
   after it). */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p = (const unsigned char *)s;
    const unsigned char *end;
    Py_ssize_t length = 0;

    if (size < 0)
        return -1;
    /* only form the end pointer once size is known to be non-negative */
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
            }
        }
    }
    return length;
}
5692
Fredrik Lundh06d12682001-01-24 07:59:11 +00005693static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005694
Alexander Belopolsky40018472011-02-26 01:02:56 +00005695PyObject *
5696PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005697 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005698 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005700 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005701 Py_ssize_t startinpos;
5702 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005703 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005705 char* message;
5706 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005707 PyObject *errorHandler = NULL;
5708 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005709 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005710
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005711 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005712 if (len == 0)
5713 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005714
5715 /* After length_of_escaped_ascii_string() there are two alternatives,
5716 either the string is pure ASCII with named escapes like \n, etc.
5717 and we determined it's exact size (common case)
5718 or it contains \x, \u, ... escape sequences. then we create a
5719 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005720 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005721 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005722 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005723 }
5724 else {
5725 /* Escaped strings will always be longer than the resulting
5726 Unicode string, so we start with size here and then reduce the
5727 length after conversion to the true value.
5728 (but if the error callback returns a long replacement string
5729 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005730 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005731 }
5732
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005734 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005736
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 while (s < end) {
5738 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005739 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005740 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741
5742 /* Non-escape characters are interpreted as Unicode ordinals */
5743 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005744 x = (unsigned char)*s;
5745 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005746 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005747 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748 continue;
5749 }
5750
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005751 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752 /* \ - Escapes */
5753 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005754 c = *s++;
5755 if (s > end)
5756 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005757
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005758 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005761#define WRITECHAR(ch) \
5762 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005763 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005764 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005765 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005766
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005768 case '\\': WRITECHAR('\\'); break;
5769 case '\'': WRITECHAR('\''); break;
5770 case '\"': WRITECHAR('\"'); break;
5771 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005772 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005773 case 'f': WRITECHAR('\014'); break;
5774 case 't': WRITECHAR('\t'); break;
5775 case 'n': WRITECHAR('\n'); break;
5776 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005777 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005778 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005779 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005780 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783 case '0': case '1': case '2': case '3':
5784 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005785 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005786 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005787 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005788 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005789 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005791 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792 break;
5793
Benjamin Peterson29060642009-01-31 22:14:21 +00005794 /* hex escapes */
5795 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005797 digits = 2;
5798 message = "truncated \\xXX escape";
5799 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800
Benjamin Peterson29060642009-01-31 22:14:21 +00005801 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005803 digits = 4;
5804 message = "truncated \\uXXXX escape";
5805 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806
Benjamin Peterson29060642009-01-31 22:14:21 +00005807 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005808 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005809 digits = 8;
5810 message = "truncated \\UXXXXXXXX escape";
5811 hexescape:
5812 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005813 if (end - s < digits) {
5814 /* count only hex digits */
5815 for (; s < end; ++s) {
5816 c = (unsigned char)*s;
5817 if (!Py_ISXDIGIT(c))
5818 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005819 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005820 goto error;
5821 }
5822 for (; digits--; ++s) {
5823 c = (unsigned char)*s;
5824 if (!Py_ISXDIGIT(c))
5825 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005826 chr = (chr<<4) & ~0xF;
5827 if (c >= '0' && c <= '9')
5828 chr += c - '0';
5829 else if (c >= 'a' && c <= 'f')
5830 chr += 10 + c - 'a';
5831 else
5832 chr += 10 + c - 'A';
5833 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005834 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005835 /* _decoding_error will have already written into the
5836 target buffer. */
5837 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005838 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005839 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005840 message = "illegal Unicode character";
5841 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005842 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005843 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005844 break;
5845
Benjamin Peterson29060642009-01-31 22:14:21 +00005846 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005847 case 'N':
5848 message = "malformed \\N character escape";
5849 if (ucnhash_CAPI == NULL) {
5850 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005851 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5852 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005853 if (ucnhash_CAPI == NULL)
5854 goto ucnhashError;
5855 }
5856 if (*s == '{') {
5857 const char *start = s+1;
5858 /* look for the closing brace */
5859 while (*s != '}' && s < end)
5860 s++;
5861 if (s > start && s < end && *s == '}') {
5862 /* found a name. look it up in the unicode database */
5863 message = "unknown Unicode character name";
5864 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005865 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005866 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005867 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005868 goto store;
5869 }
5870 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005871 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005872
5873 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005874 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005875 message = "\\ at end of string";
5876 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005877 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005878 }
5879 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005880 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005881 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005882 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005883 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005885 continue;
5886
5887 error:
5888 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005889 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005890 errors, &errorHandler,
5891 "unicodeescape", message,
5892 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005893 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005894 goto onError;
5895 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005897#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005898
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005899 Py_XDECREF(errorHandler);
5900 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005901 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005902
Benjamin Peterson29060642009-01-31 22:14:21 +00005903 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005904 PyErr_SetString(
5905 PyExc_UnicodeError,
5906 "\\N escapes not supported (can't load unicodedata module)"
5907 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005908 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005909 Py_XDECREF(errorHandler);
5910 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005911 return NULL;
5912
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005914 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005915 Py_XDECREF(errorHandler);
5916 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 return NULL;
5918}
5919
5920/* Return a Unicode-Escape string version of the Unicode object.
5921
5922 If quotes is true, the string is enclosed in u"" or u'' quotes as
5923 appropriate.
5924
5925*/
5926
Alexander Belopolsky40018472011-02-26 01:02:56 +00005927PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005928PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005930 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005931 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005933 int kind;
5934 void *data;
5935 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936
Ezio Melottie7f90372012-10-05 03:33:31 +03005937 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005938 escape.
5939
Ezio Melottie7f90372012-10-05 03:33:31 +03005940 For UCS1 strings it's '\xxx', 4 bytes per source character.
5941 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5942 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005943 */
5944
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005945 if (!PyUnicode_Check(unicode)) {
5946 PyErr_BadArgument();
5947 return NULL;
5948 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005949 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005950 return NULL;
5951 len = PyUnicode_GET_LENGTH(unicode);
5952 kind = PyUnicode_KIND(unicode);
5953 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005954 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005955 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5956 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5957 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5958 }
5959
5960 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005961 return PyBytes_FromStringAndSize(NULL, 0);
5962
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005963 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005964 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005965
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005966 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005967 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005968 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005969 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970 if (repr == NULL)
5971 return NULL;
5972
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005973 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005975 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005976 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005977
Walter Dörwald79e913e2007-05-12 11:08:06 +00005978 /* Escape backslashes */
5979 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 *p++ = '\\';
5981 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005982 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005983 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005984
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005985 /* Map 21-bit characters to '\U00xxxxxx' */
5986 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005987 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005988 *p++ = '\\';
5989 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005990 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5991 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5992 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5993 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5994 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5995 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5996 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5997 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005998 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005999 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006000
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006002 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 *p++ = '\\';
6004 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006005 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6006 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6007 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6008 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006010
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006011 /* Map special whitespace to '\t', \n', '\r' */
6012 else if (ch == '\t') {
6013 *p++ = '\\';
6014 *p++ = 't';
6015 }
6016 else if (ch == '\n') {
6017 *p++ = '\\';
6018 *p++ = 'n';
6019 }
6020 else if (ch == '\r') {
6021 *p++ = '\\';
6022 *p++ = 'r';
6023 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006024
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006025 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006026 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006028 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006029 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6030 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006031 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006032
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 /* Copy everything else as-is */
6034 else
6035 *p++ = (char) ch;
6036 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006038 assert(p - PyBytes_AS_STRING(repr) > 0);
6039 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6040 return NULL;
6041 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042}
6043
Alexander Belopolsky40018472011-02-26 01:02:56 +00006044PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006045PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6046 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006048 PyObject *result;
6049 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6050 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006052 result = PyUnicode_AsUnicodeEscapeString(tmp);
6053 Py_DECREF(tmp);
6054 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055}
6056
6057/* --- Raw Unicode Escape Codec ------------------------------------------- */
6058
Alexander Belopolsky40018472011-02-26 01:02:56 +00006059PyObject *
6060PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006061 Py_ssize_t size,
6062 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006064 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006065 Py_ssize_t startinpos;
6066 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006067 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 const char *end;
6069 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006070 PyObject *errorHandler = NULL;
6071 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006072
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006073 if (size == 0)
6074 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006075
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 /* Escaped strings will always be longer than the resulting
6077 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006078 length after conversion to the true value. (But decoding error
6079 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006080 _PyUnicodeWriter_Init(&writer);
6081 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006082
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083 end = s + size;
6084 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006085 unsigned char c;
6086 Py_UCS4 x;
6087 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006088 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 /* Non-escape characters are interpreted as Unicode ordinals */
6091 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006092 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006093 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006094 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006095 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006096 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006097 startinpos = s-starts;
6098
6099 /* \u-escapes are only interpreted iff the number of leading
6100 backslashes if odd */
6101 bs = s;
6102 for (;s < end;) {
6103 if (*s != '\\')
6104 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006105 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006106 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006107 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006108 }
6109 if (((s - bs) & 1) == 0 ||
6110 s >= end ||
6111 (*s != 'u' && *s != 'U')) {
6112 continue;
6113 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006114 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006115 count = *s=='u' ? 4 : 8;
6116 s++;
6117
6118 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 for (x = 0, i = 0; i < count; ++i, ++s) {
6120 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006121 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006122 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006123 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 errors, &errorHandler,
6125 "rawunicodeescape", "truncated \\uXXXX",
6126 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006127 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 goto onError;
6129 goto nextByte;
6130 }
6131 x = (x<<4) & ~0xF;
6132 if (c >= '0' && c <= '9')
6133 x += c - '0';
6134 else if (c >= 'a' && c <= 'f')
6135 x += 10 + c - 'a';
6136 else
6137 x += 10 + c - 'A';
6138 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006139 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006140 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006141 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006142 }
6143 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006144 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006145 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006146 errors, &errorHandler,
6147 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006148 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006149 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006150 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006151 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006152 nextByte:
6153 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006155 Py_XDECREF(errorHandler);
6156 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006157 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006158
Benjamin Peterson29060642009-01-31 22:14:21 +00006159 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006160 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006161 Py_XDECREF(errorHandler);
6162 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 return NULL;
6164}
6165
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006166
Alexander Belopolsky40018472011-02-26 01:02:56 +00006167PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006168PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006170 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 char *p;
6172 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006173 Py_ssize_t expandsize, pos;
6174 int kind;
6175 void *data;
6176 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006178 if (!PyUnicode_Check(unicode)) {
6179 PyErr_BadArgument();
6180 return NULL;
6181 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006182 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006183 return NULL;
6184 kind = PyUnicode_KIND(unicode);
6185 data = PyUnicode_DATA(unicode);
6186 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006187 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6188 bytes, and 1 byte characters 4. */
6189 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006190
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006191 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006192 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006193
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006194 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195 if (repr == NULL)
6196 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006197 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006198 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006200 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006201 for (pos = 0; pos < len; pos++) {
6202 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 /* Map 32-bit characters to '\Uxxxxxxxx' */
6204 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006205 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006206 *p++ = '\\';
6207 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006208 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6209 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6210 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6211 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6212 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6213 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6214 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6215 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006216 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006217 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006218 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219 *p++ = '\\';
6220 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006221 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6222 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6223 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6224 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006226 /* Copy everything else as-is */
6227 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228 *p++ = (char) ch;
6229 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006230
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006231 assert(p > q);
6232 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006233 return NULL;
6234 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235}
6236
Alexander Belopolsky40018472011-02-26 01:02:56 +00006237PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006238PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6239 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006241 PyObject *result;
6242 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6243 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006244 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006245 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6246 Py_DECREF(tmp);
6247 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248}
6249
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006250/* --- Unicode Internal Codec ------------------------------------------- */
6251
Alexander Belopolsky40018472011-02-26 01:02:56 +00006252PyObject *
6253_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006254 Py_ssize_t size,
6255 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006256{
6257 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006258 Py_ssize_t startinpos;
6259 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006260 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006261 const char *end;
6262 const char *reason;
6263 PyObject *errorHandler = NULL;
6264 PyObject *exc = NULL;
6265
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006266 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006267 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006268 1))
6269 return NULL;
6270
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006271 if (size == 0)
6272 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006273
Victor Stinner8f674cc2013-04-17 23:02:17 +02006274 _PyUnicodeWriter_Init(&writer);
6275 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6276 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006278 }
6279 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006280
Victor Stinner8f674cc2013-04-17 23:02:17 +02006281 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006282 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006283 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006284 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006285 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006286 endinpos = end-starts;
6287 reason = "truncated input";
6288 goto error;
6289 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006290 /* We copy the raw representation one byte at a time because the
6291 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006292 ((char *) &uch)[0] = s[0];
6293 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006294#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006295 ((char *) &uch)[2] = s[2];
6296 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006297#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006298 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006299#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006300 /* We have to sanity check the raw data, otherwise doom looms for
6301 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006302 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006303 endinpos = s - starts + Py_UNICODE_SIZE;
6304 reason = "illegal code point (> 0x10FFFF)";
6305 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006306 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006307#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006308 s += Py_UNICODE_SIZE;
6309#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006310 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006311 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006312 Py_UNICODE uch2;
6313 ((char *) &uch2)[0] = s[0];
6314 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006315 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006316 {
Victor Stinner551ac952011-11-29 22:58:13 +01006317 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006318 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006319 }
6320 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006321#endif
6322
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006323 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006324 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006325 continue;
6326
6327 error:
6328 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006329 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006330 errors, &errorHandler,
6331 "unicode_internal", reason,
6332 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006333 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006334 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006335 }
6336
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006337 Py_XDECREF(errorHandler);
6338 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006339 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006340
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006342 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006343 Py_XDECREF(errorHandler);
6344 Py_XDECREF(exc);
6345 return NULL;
6346}
6347
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348/* --- Latin-1 Codec ------------------------------------------------------ */
6349
Alexander Belopolsky40018472011-02-26 01:02:56 +00006350PyObject *
6351PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006352 Py_ssize_t size,
6353 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006356 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357}
6358
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006359/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006360static void
6361make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006362 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006363 PyObject *unicode,
6364 Py_ssize_t startpos, Py_ssize_t endpos,
6365 const char *reason)
6366{
6367 if (*exceptionObject == NULL) {
6368 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006369 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006370 encoding, unicode, startpos, endpos, reason);
6371 }
6372 else {
6373 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6374 goto onError;
6375 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6376 goto onError;
6377 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6378 goto onError;
6379 return;
6380 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006381 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006382 }
6383}
6384
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006385/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006386static void
6387raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006388 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006389 PyObject *unicode,
6390 Py_ssize_t startpos, Py_ssize_t endpos,
6391 const char *reason)
6392{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006393 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006394 encoding, unicode, startpos, endpos, reason);
6395 if (*exceptionObject != NULL)
6396 PyCodec_StrictErrors(*exceptionObject);
6397}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006398
6399/* error handling callback helper:
6400 build arguments, call the callback and check the arguments,
6401 put the result into newpos and return the replacement string, which
6402 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006403static PyObject *
6404unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006405 PyObject **errorHandler,
6406 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006407 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006408 Py_ssize_t startpos, Py_ssize_t endpos,
6409 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006410{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006411 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006412 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006413 PyObject *restuple;
6414 PyObject *resunicode;
6415
6416 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006417 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006418 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006419 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006420 }
6421
Benjamin Petersonbac79492012-01-14 13:34:47 -05006422 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006423 return NULL;
6424 len = PyUnicode_GET_LENGTH(unicode);
6425
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006426 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006427 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006428 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006429 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430
6431 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006432 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006433 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006434 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006435 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006436 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 Py_DECREF(restuple);
6438 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006439 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006440 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 &resunicode, newpos)) {
6442 Py_DECREF(restuple);
6443 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006444 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006445 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6446 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6447 Py_DECREF(restuple);
6448 return NULL;
6449 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006450 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006451 *newpos = len + *newpos;
6452 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006453 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006454 Py_DECREF(restuple);
6455 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006456 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006457 Py_INCREF(resunicode);
6458 Py_DECREF(restuple);
6459 return resunicode;
6460}
6461
Alexander Belopolsky40018472011-02-26 01:02:56 +00006462static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006463unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006464 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006465 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006466{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006467 /* input state */
6468 Py_ssize_t pos=0, size;
6469 int kind;
6470 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006471 /* output object */
6472 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006473 /* pointer into the output */
6474 char *str;
6475 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006476 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006477 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6478 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006479 PyObject *errorHandler = NULL;
6480 PyObject *exc = NULL;
6481 /* the following variable is used for caching string comparisons
6482 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6483 int known_errorHandler = -1;
6484
Benjamin Petersonbac79492012-01-14 13:34:47 -05006485 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006486 return NULL;
6487 size = PyUnicode_GET_LENGTH(unicode);
6488 kind = PyUnicode_KIND(unicode);
6489 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006490 /* allocate enough for a simple encoding without
6491 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006492 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006493 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006494 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006495 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006496 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006497 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006498 ressize = size;
6499
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006500 while (pos < size) {
6501 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006502
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 /* can we encode this? */
6504 if (c<limit) {
6505 /* no overflow check, because we know that the space is enough */
6506 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006507 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006508 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006509 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006510 Py_ssize_t requiredsize;
6511 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006512 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006513 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006514 Py_ssize_t collstart = pos;
6515 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006516 /* find all unecodable characters */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006517 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006518 ++collend;
6519 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6520 if (known_errorHandler==-1) {
6521 if ((errors==NULL) || (!strcmp(errors, "strict")))
6522 known_errorHandler = 1;
6523 else if (!strcmp(errors, "replace"))
6524 known_errorHandler = 2;
6525 else if (!strcmp(errors, "ignore"))
6526 known_errorHandler = 3;
6527 else if (!strcmp(errors, "xmlcharrefreplace"))
6528 known_errorHandler = 4;
6529 else
6530 known_errorHandler = 0;
6531 }
6532 switch (known_errorHandler) {
6533 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006534 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 goto onError;
6536 case 2: /* replace */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006537 while (collstart++ < collend)
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 *str++ = '?'; /* fall through */
6539 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006540 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 break;
6542 case 4: /* xmlcharrefreplace */
6543 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006544 requiredsize = respos;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006545 /* determine replacement size */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006546 for (i = collstart; i < collend; ++i) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006547 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006548 Py_ssize_t incr;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006549 if (ch < 10)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006550 incr = 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006551 else if (ch < 100)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006552 incr = 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006553 else if (ch < 1000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006554 incr = 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006555 else if (ch < 10000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006556 incr = 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006557 else if (ch < 100000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006558 incr = 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006559 else if (ch < 1000000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006560 incr = 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006561 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006562 assert(ch <= MAX_UNICODE);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006563 incr = 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006564 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006565 if (requiredsize > PY_SSIZE_T_MAX - incr)
6566 goto overflow;
6567 requiredsize += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +00006568 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006569 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6570 goto overflow;
6571 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006572 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006573 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 requiredsize = 2*ressize;
6575 if (_PyBytes_Resize(&res, requiredsize))
6576 goto onError;
6577 str = PyBytes_AS_STRING(res) + respos;
6578 ressize = requiredsize;
6579 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006580 /* generate replacement */
6581 for (i = collstart; i < collend; ++i) {
6582 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006583 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006584 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006585 break;
6586 default:
6587 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006588 encoding, reason, unicode, &exc,
6589 collstart, collend, &newpos);
6590 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006591 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006592 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006593 if (PyBytes_Check(repunicode)) {
6594 /* Directly copy bytes result to output. */
6595 repsize = PyBytes_Size(repunicode);
6596 if (repsize > 1) {
6597 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006598 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006599 if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
6600 Py_DECREF(repunicode);
6601 goto overflow;
6602 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00006603 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6604 Py_DECREF(repunicode);
6605 goto onError;
6606 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006607 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006608 ressize += repsize-1;
6609 }
6610 memcpy(str, PyBytes_AsString(repunicode), repsize);
6611 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006612 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006613 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006614 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006615 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006616 /* need more space? (at least enough for what we
6617 have+the replacement+the rest of the string, so
6618 we won't have to check space for encodable characters) */
6619 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006620 repsize = PyUnicode_GET_LENGTH(repunicode);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006621 requiredsize = respos;
6622 if (requiredsize > PY_SSIZE_T_MAX - repsize)
6623 goto overflow;
6624 requiredsize += repsize;
6625 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6626 goto overflow;
6627 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006628 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006629 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006630 requiredsize = 2*ressize;
6631 if (_PyBytes_Resize(&res, requiredsize)) {
6632 Py_DECREF(repunicode);
6633 goto onError;
6634 }
6635 str = PyBytes_AS_STRING(res) + respos;
6636 ressize = requiredsize;
6637 }
6638 /* check if there is anything unencodable in the replacement
6639 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006640 for (i = 0; repsize-->0; ++i, ++str) {
6641 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006642 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006643 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006644 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006645 Py_DECREF(repunicode);
6646 goto onError;
6647 }
6648 *str = (char)c;
6649 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006650 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006651 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006652 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006653 }
6654 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006655 /* Resize if we allocated to much */
6656 size = str - PyBytes_AS_STRING(res);
6657 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006658 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006659 if (_PyBytes_Resize(&res, size) < 0)
6660 goto onError;
6661 }
6662
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006663 Py_XDECREF(errorHandler);
6664 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006665 return res;
6666
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006667 overflow:
6668 PyErr_SetString(PyExc_OverflowError,
6669 "encoded result is too long for a Python string");
6670
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006671 onError:
6672 Py_XDECREF(res);
6673 Py_XDECREF(errorHandler);
6674 Py_XDECREF(exc);
6675 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006676}
6677
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006678/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006679PyObject *
6680PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006681 Py_ssize_t size,
6682 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006684 PyObject *result;
6685 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6686 if (unicode == NULL)
6687 return NULL;
6688 result = unicode_encode_ucs1(unicode, errors, 256);
6689 Py_DECREF(unicode);
6690 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691}
6692
Alexander Belopolsky40018472011-02-26 01:02:56 +00006693PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006694_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695{
6696 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006697 PyErr_BadArgument();
6698 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006700 if (PyUnicode_READY(unicode) == -1)
6701 return NULL;
6702 /* Fast path: if it is a one-byte string, construct
6703 bytes object directly. */
6704 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6705 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6706 PyUnicode_GET_LENGTH(unicode));
6707 /* Non-Latin-1 characters present. Defer to above function to
6708 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006709 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006710}
6711
6712PyObject*
6713PyUnicode_AsLatin1String(PyObject *unicode)
6714{
6715 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716}
6717
6718/* --- 7-bit ASCII Codec -------------------------------------------------- */
6719
Alexander Belopolsky40018472011-02-26 01:02:56 +00006720PyObject *
6721PyUnicode_DecodeASCII(const char *s,
6722 Py_ssize_t size,
6723 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006725 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006726 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006727 int kind;
6728 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006729 Py_ssize_t startinpos;
6730 Py_ssize_t endinpos;
6731 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006732 const char *e;
6733 PyObject *errorHandler = NULL;
6734 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006735
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006737 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006738
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006740 if (size == 1 && (unsigned char)s[0] < 128)
6741 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006742
Victor Stinner8f674cc2013-04-17 23:02:17 +02006743 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006744 writer.min_length = size;
6745 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006746 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006748 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006749 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006750 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006751 writer.pos = outpos;
6752 if (writer.pos == size)
6753 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006754
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006755 s += writer.pos;
6756 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006757 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006758 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006760 PyUnicode_WRITE(kind, data, writer.pos, c);
6761 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 ++s;
6763 }
6764 else {
6765 startinpos = s-starts;
6766 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006767 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006768 errors, &errorHandler,
6769 "ascii", "ordinal not in range(128)",
6770 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006771 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006773 kind = writer.kind;
6774 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006777 Py_XDECREF(errorHandler);
6778 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006779 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006780
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006782 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006783 Py_XDECREF(errorHandler);
6784 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785 return NULL;
6786}
6787
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006788/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006789PyObject *
6790PyUnicode_EncodeASCII(const Py_UNICODE *p,
6791 Py_ssize_t size,
6792 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006794 PyObject *result;
6795 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6796 if (unicode == NULL)
6797 return NULL;
6798 result = unicode_encode_ucs1(unicode, errors, 128);
6799 Py_DECREF(unicode);
6800 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006801}
6802
Alexander Belopolsky40018472011-02-26 01:02:56 +00006803PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006804_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805{
6806 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 PyErr_BadArgument();
6808 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006810 if (PyUnicode_READY(unicode) == -1)
6811 return NULL;
6812 /* Fast path: if it is an ASCII-only string, construct bytes object
6813 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006814 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006815 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6816 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006817 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006818}
6819
6820PyObject *
6821PyUnicode_AsASCIIString(PyObject *unicode)
6822{
6823 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824}
6825
Victor Stinner99b95382011-07-04 14:23:54 +02006826#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006827
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006828/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006829
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006830#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006831#define NEED_RETRY
6832#endif
6833
Victor Stinner3a50e702011-10-18 21:21:00 +02006834#ifndef WC_ERR_INVALID_CHARS
6835# define WC_ERR_INVALID_CHARS 0x0080
6836#endif
6837
6838static char*
6839code_page_name(UINT code_page, PyObject **obj)
6840{
6841 *obj = NULL;
6842 if (code_page == CP_ACP)
6843 return "mbcs";
6844 if (code_page == CP_UTF7)
6845 return "CP_UTF7";
6846 if (code_page == CP_UTF8)
6847 return "CP_UTF8";
6848
6849 *obj = PyBytes_FromFormat("cp%u", code_page);
6850 if (*obj == NULL)
6851 return NULL;
6852 return PyBytes_AS_STRING(*obj);
6853}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006854
Alexander Belopolsky40018472011-02-26 01:02:56 +00006855static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006856is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006857{
6858 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006859 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006860
Victor Stinner3a50e702011-10-18 21:21:00 +02006861 if (!IsDBCSLeadByteEx(code_page, *curr))
6862 return 0;
6863
6864 prev = CharPrevExA(code_page, s, curr, 0);
6865 if (prev == curr)
6866 return 1;
6867 /* FIXME: This code is limited to "true" double-byte encodings,
6868 as it assumes an incomplete character consists of a single
6869 byte. */
6870 if (curr - prev == 2)
6871 return 1;
6872 if (!IsDBCSLeadByteEx(code_page, *prev))
6873 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006874 return 0;
6875}
6876
Victor Stinner3a50e702011-10-18 21:21:00 +02006877static DWORD
6878decode_code_page_flags(UINT code_page)
6879{
6880 if (code_page == CP_UTF7) {
6881 /* The CP_UTF7 decoder only supports flags=0 */
6882 return 0;
6883 }
6884 else
6885 return MB_ERR_INVALID_CHARS;
6886}
6887
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006888/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006889 * Decode a byte string from a Windows code page into unicode object in strict
6890 * mode.
6891 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006892 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6893 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006894 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006895static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006896decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006897 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006898 const char *in,
6899 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006900{
Victor Stinner3a50e702011-10-18 21:21:00 +02006901 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006902 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006903 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006904
6905 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006906 assert(insize > 0);
6907 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6908 if (outsize <= 0)
6909 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006910
6911 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006912 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006913 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006914 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006915 if (*v == NULL)
6916 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006917 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006918 }
6919 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006920 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006921 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006922 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006923 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006924 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006925 }
6926
6927 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006928 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6929 if (outsize <= 0)
6930 goto error;
6931 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006932
Victor Stinner3a50e702011-10-18 21:21:00 +02006933error:
6934 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6935 return -2;
6936 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006937 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006938}
6939
Victor Stinner3a50e702011-10-18 21:21:00 +02006940/*
6941 * Decode a byte string from a code page into unicode object with an error
6942 * handler.
6943 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006944 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006945 * UnicodeDecodeError exception and returns -1 on error.
6946 */
6947static int
6948decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006949 PyObject **v,
6950 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006951 const char *errors)
6952{
6953 const char *startin = in;
6954 const char *endin = in + size;
6955 const DWORD flags = decode_code_page_flags(code_page);
6956 /* Ideally, we should get reason from FormatMessage. This is the Windows
6957 2000 English version of the message. */
6958 const char *reason = "No mapping for the Unicode character exists "
6959 "in the target code page.";
6960 /* each step cannot decode more than 1 character, but a character can be
6961 represented as a surrogate pair */
6962 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006963 int insize;
6964 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006965 PyObject *errorHandler = NULL;
6966 PyObject *exc = NULL;
6967 PyObject *encoding_obj = NULL;
6968 char *encoding;
6969 DWORD err;
6970 int ret = -1;
6971
6972 assert(size > 0);
6973
6974 encoding = code_page_name(code_page, &encoding_obj);
6975 if (encoding == NULL)
6976 return -1;
6977
6978 if (errors == NULL || strcmp(errors, "strict") == 0) {
6979 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6980 UnicodeDecodeError. */
6981 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6982 if (exc != NULL) {
6983 PyCodec_StrictErrors(exc);
6984 Py_CLEAR(exc);
6985 }
6986 goto error;
6987 }
6988
6989 if (*v == NULL) {
6990 /* Create unicode object */
6991 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6992 PyErr_NoMemory();
6993 goto error;
6994 }
Victor Stinnerab595942011-12-17 04:59:06 +01006995 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006996 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006997 if (*v == NULL)
6998 goto error;
6999 startout = PyUnicode_AS_UNICODE(*v);
7000 }
7001 else {
7002 /* Extend unicode object */
7003 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7004 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7005 PyErr_NoMemory();
7006 goto error;
7007 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007008 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007009 goto error;
7010 startout = PyUnicode_AS_UNICODE(*v) + n;
7011 }
7012
7013 /* Decode the byte string character per character */
7014 out = startout;
7015 while (in < endin)
7016 {
7017 /* Decode a character */
7018 insize = 1;
7019 do
7020 {
7021 outsize = MultiByteToWideChar(code_page, flags,
7022 in, insize,
7023 buffer, Py_ARRAY_LENGTH(buffer));
7024 if (outsize > 0)
7025 break;
7026 err = GetLastError();
7027 if (err != ERROR_NO_UNICODE_TRANSLATION
7028 && err != ERROR_INSUFFICIENT_BUFFER)
7029 {
7030 PyErr_SetFromWindowsErr(0);
7031 goto error;
7032 }
7033 insize++;
7034 }
7035 /* 4=maximum length of a UTF-8 sequence */
7036 while (insize <= 4 && (in + insize) <= endin);
7037
7038 if (outsize <= 0) {
7039 Py_ssize_t startinpos, endinpos, outpos;
7040
7041 startinpos = in - startin;
7042 endinpos = startinpos + 1;
7043 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007044 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007045 errors, &errorHandler,
7046 encoding, reason,
7047 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007048 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007049 {
7050 goto error;
7051 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007052 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007053 }
7054 else {
7055 in += insize;
7056 memcpy(out, buffer, outsize * sizeof(wchar_t));
7057 out += outsize;
7058 }
7059 }
7060
7061 /* write a NUL character at the end */
7062 *out = 0;
7063
7064 /* Extend unicode object */
7065 outsize = out - startout;
7066 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007067 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007068 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007069 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007070
7071error:
7072 Py_XDECREF(encoding_obj);
7073 Py_XDECREF(errorHandler);
7074 Py_XDECREF(exc);
7075 return ret;
7076}
7077
Victor Stinner3a50e702011-10-18 21:21:00 +02007078static PyObject *
7079decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007080 const char *s, Py_ssize_t size,
7081 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007082{
Victor Stinner76a31a62011-11-04 00:05:13 +01007083 PyObject *v = NULL;
7084 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007085
Victor Stinner3a50e702011-10-18 21:21:00 +02007086 if (code_page < 0) {
7087 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7088 return NULL;
7089 }
7090
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007091 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007092 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007093
Victor Stinner76a31a62011-11-04 00:05:13 +01007094 do
7095 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007096#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007097 if (size > INT_MAX) {
7098 chunk_size = INT_MAX;
7099 final = 0;
7100 done = 0;
7101 }
7102 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007103#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007104 {
7105 chunk_size = (int)size;
7106 final = (consumed == NULL);
7107 done = 1;
7108 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007109
Victor Stinner76a31a62011-11-04 00:05:13 +01007110 /* Skip trailing lead-byte unless 'final' is set */
7111 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7112 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007113
Victor Stinner76a31a62011-11-04 00:05:13 +01007114 if (chunk_size == 0 && done) {
7115 if (v != NULL)
7116 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007117 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007118 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007119
Victor Stinner76a31a62011-11-04 00:05:13 +01007120
7121 converted = decode_code_page_strict(code_page, &v,
7122 s, chunk_size);
7123 if (converted == -2)
7124 converted = decode_code_page_errors(code_page, &v,
7125 s, chunk_size,
7126 errors);
7127 assert(converted != 0);
7128
7129 if (converted < 0) {
7130 Py_XDECREF(v);
7131 return NULL;
7132 }
7133
7134 if (consumed)
7135 *consumed += converted;
7136
7137 s += converted;
7138 size -= converted;
7139 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007140
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007141 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007142}
7143
Alexander Belopolsky40018472011-02-26 01:02:56 +00007144PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007145PyUnicode_DecodeCodePageStateful(int code_page,
7146 const char *s,
7147 Py_ssize_t size,
7148 const char *errors,
7149 Py_ssize_t *consumed)
7150{
7151 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7152}
7153
7154PyObject *
7155PyUnicode_DecodeMBCSStateful(const char *s,
7156 Py_ssize_t size,
7157 const char *errors,
7158 Py_ssize_t *consumed)
7159{
7160 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7161}
7162
7163PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007164PyUnicode_DecodeMBCS(const char *s,
7165 Py_ssize_t size,
7166 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007167{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007168 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7169}
7170
Victor Stinner3a50e702011-10-18 21:21:00 +02007171static DWORD
7172encode_code_page_flags(UINT code_page, const char *errors)
7173{
7174 if (code_page == CP_UTF8) {
7175 if (winver.dwMajorVersion >= 6)
7176 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7177 and later */
7178 return WC_ERR_INVALID_CHARS;
7179 else
7180 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7181 return 0;
7182 }
7183 else if (code_page == CP_UTF7) {
7184 /* CP_UTF7 only supports flags=0 */
7185 return 0;
7186 }
7187 else {
7188 if (errors != NULL && strcmp(errors, "replace") == 0)
7189 return 0;
7190 else
7191 return WC_NO_BEST_FIT_CHARS;
7192 }
7193}
7194
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007195/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007196 * Encode a Unicode string to a Windows code page into a byte string in strict
7197 * mode.
7198 *
7199 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007200 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007201 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007202static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007203encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007204 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007205 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007206{
Victor Stinner554f3f02010-06-16 23:33:54 +00007207 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007208 BOOL *pusedDefaultChar = &usedDefaultChar;
7209 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007210 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007211 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007212 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007213 const DWORD flags = encode_code_page_flags(code_page, NULL);
7214 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007215 /* Create a substring so that we can get the UTF-16 representation
7216 of just the slice under consideration. */
7217 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007218
Martin v. Löwis3d325192011-11-04 18:23:06 +01007219 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007220
Victor Stinner3a50e702011-10-18 21:21:00 +02007221 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007222 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007223 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007224 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007225
Victor Stinner2fc507f2011-11-04 20:06:39 +01007226 substring = PyUnicode_Substring(unicode, offset, offset+len);
7227 if (substring == NULL)
7228 return -1;
7229 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7230 if (p == NULL) {
7231 Py_DECREF(substring);
7232 return -1;
7233 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007234 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007235
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007236 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007237 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007238 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007239 NULL, 0,
7240 NULL, pusedDefaultChar);
7241 if (outsize <= 0)
7242 goto error;
7243 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007244 if (pusedDefaultChar && *pusedDefaultChar) {
7245 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007246 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007247 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007248
Victor Stinner3a50e702011-10-18 21:21:00 +02007249 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007250 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007251 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007252 if (*outbytes == NULL) {
7253 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007254 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007255 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007256 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007257 }
7258 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007259 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007260 const Py_ssize_t n = PyBytes_Size(*outbytes);
7261 if (outsize > PY_SSIZE_T_MAX - n) {
7262 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007263 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007264 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007265 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007266 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7267 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007268 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007269 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007270 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007271 }
7272
7273 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007274 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007275 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007276 out, outsize,
7277 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007278 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007279 if (outsize <= 0)
7280 goto error;
7281 if (pusedDefaultChar && *pusedDefaultChar)
7282 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007283 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007284
Victor Stinner3a50e702011-10-18 21:21:00 +02007285error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007286 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007287 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7288 return -2;
7289 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007290 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007291}
7292
Victor Stinner3a50e702011-10-18 21:21:00 +02007293/*
7294 * Encode a Unicode string to a Windows code page into a byte string using a
7295 * error handler.
7296 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007297 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007298 * -1 on other error.
7299 */
7300static int
7301encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007302 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007303 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007304{
Victor Stinner3a50e702011-10-18 21:21:00 +02007305 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007306 Py_ssize_t pos = unicode_offset;
7307 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007308 /* Ideally, we should get reason from FormatMessage. This is the Windows
7309 2000 English version of the message. */
7310 const char *reason = "invalid character";
7311 /* 4=maximum length of a UTF-8 sequence */
7312 char buffer[4];
7313 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7314 Py_ssize_t outsize;
7315 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007316 PyObject *errorHandler = NULL;
7317 PyObject *exc = NULL;
7318 PyObject *encoding_obj = NULL;
7319 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007320 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007321 PyObject *rep;
7322 int ret = -1;
7323
7324 assert(insize > 0);
7325
7326 encoding = code_page_name(code_page, &encoding_obj);
7327 if (encoding == NULL)
7328 return -1;
7329
7330 if (errors == NULL || strcmp(errors, "strict") == 0) {
7331 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7332 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007333 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007334 if (exc != NULL) {
7335 PyCodec_StrictErrors(exc);
7336 Py_DECREF(exc);
7337 }
7338 Py_XDECREF(encoding_obj);
7339 return -1;
7340 }
7341
7342 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7343 pusedDefaultChar = &usedDefaultChar;
7344 else
7345 pusedDefaultChar = NULL;
7346
7347 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7348 PyErr_NoMemory();
7349 goto error;
7350 }
7351 outsize = insize * Py_ARRAY_LENGTH(buffer);
7352
7353 if (*outbytes == NULL) {
7354 /* Create string object */
7355 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7356 if (*outbytes == NULL)
7357 goto error;
7358 out = PyBytes_AS_STRING(*outbytes);
7359 }
7360 else {
7361 /* Extend string object */
7362 Py_ssize_t n = PyBytes_Size(*outbytes);
7363 if (n > PY_SSIZE_T_MAX - outsize) {
7364 PyErr_NoMemory();
7365 goto error;
7366 }
7367 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7368 goto error;
7369 out = PyBytes_AS_STRING(*outbytes) + n;
7370 }
7371
7372 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007373 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007374 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007375 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7376 wchar_t chars[2];
7377 int charsize;
7378 if (ch < 0x10000) {
7379 chars[0] = (wchar_t)ch;
7380 charsize = 1;
7381 }
7382 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007383 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7384 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007385 charsize = 2;
7386 }
7387
Victor Stinner3a50e702011-10-18 21:21:00 +02007388 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007389 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007390 buffer, Py_ARRAY_LENGTH(buffer),
7391 NULL, pusedDefaultChar);
7392 if (outsize > 0) {
7393 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7394 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007395 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007396 memcpy(out, buffer, outsize);
7397 out += outsize;
7398 continue;
7399 }
7400 }
7401 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7402 PyErr_SetFromWindowsErr(0);
7403 goto error;
7404 }
7405
Victor Stinner3a50e702011-10-18 21:21:00 +02007406 rep = unicode_encode_call_errorhandler(
7407 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007408 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007409 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007410 if (rep == NULL)
7411 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007412 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007413
7414 if (PyBytes_Check(rep)) {
7415 outsize = PyBytes_GET_SIZE(rep);
7416 if (outsize != 1) {
7417 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7418 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7419 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7420 Py_DECREF(rep);
7421 goto error;
7422 }
7423 out = PyBytes_AS_STRING(*outbytes) + offset;
7424 }
7425 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7426 out += outsize;
7427 }
7428 else {
7429 Py_ssize_t i;
7430 enum PyUnicode_Kind kind;
7431 void *data;
7432
Benjamin Petersonbac79492012-01-14 13:34:47 -05007433 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007434 Py_DECREF(rep);
7435 goto error;
7436 }
7437
7438 outsize = PyUnicode_GET_LENGTH(rep);
7439 if (outsize != 1) {
7440 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7441 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7442 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7443 Py_DECREF(rep);
7444 goto error;
7445 }
7446 out = PyBytes_AS_STRING(*outbytes) + offset;
7447 }
7448 kind = PyUnicode_KIND(rep);
7449 data = PyUnicode_DATA(rep);
7450 for (i=0; i < outsize; i++) {
7451 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7452 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007453 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007454 encoding, unicode,
7455 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007456 "unable to encode error handler result to ASCII");
7457 Py_DECREF(rep);
7458 goto error;
7459 }
7460 *out = (unsigned char)ch;
7461 out++;
7462 }
7463 }
7464 Py_DECREF(rep);
7465 }
7466 /* write a NUL byte */
7467 *out = 0;
7468 outsize = out - PyBytes_AS_STRING(*outbytes);
7469 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7470 if (_PyBytes_Resize(outbytes, outsize) < 0)
7471 goto error;
7472 ret = 0;
7473
7474error:
7475 Py_XDECREF(encoding_obj);
7476 Py_XDECREF(errorHandler);
7477 Py_XDECREF(exc);
7478 return ret;
7479}
7480
Victor Stinner3a50e702011-10-18 21:21:00 +02007481static PyObject *
7482encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007483 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007484 const char *errors)
7485{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007486 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007487 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007488 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007489 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007490
Benjamin Petersonbac79492012-01-14 13:34:47 -05007491 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007492 return NULL;
7493 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007494
Victor Stinner3a50e702011-10-18 21:21:00 +02007495 if (code_page < 0) {
7496 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7497 return NULL;
7498 }
7499
Martin v. Löwis3d325192011-11-04 18:23:06 +01007500 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007501 return PyBytes_FromStringAndSize(NULL, 0);
7502
Victor Stinner7581cef2011-11-03 22:32:33 +01007503 offset = 0;
7504 do
7505 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007506#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007507 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007508 chunks. */
7509 if (len > INT_MAX/2) {
7510 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007511 done = 0;
7512 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007513 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007514#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007515 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007516 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007517 done = 1;
7518 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007519
Victor Stinner76a31a62011-11-04 00:05:13 +01007520 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007521 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007522 errors);
7523 if (ret == -2)
7524 ret = encode_code_page_errors(code_page, &outbytes,
7525 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007526 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007527 if (ret < 0) {
7528 Py_XDECREF(outbytes);
7529 return NULL;
7530 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007531
Victor Stinner7581cef2011-11-03 22:32:33 +01007532 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007533 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007534 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007535
Victor Stinner3a50e702011-10-18 21:21:00 +02007536 return outbytes;
7537}
7538
7539PyObject *
7540PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7541 Py_ssize_t size,
7542 const char *errors)
7543{
Victor Stinner7581cef2011-11-03 22:32:33 +01007544 PyObject *unicode, *res;
7545 unicode = PyUnicode_FromUnicode(p, size);
7546 if (unicode == NULL)
7547 return NULL;
7548 res = encode_code_page(CP_ACP, unicode, errors);
7549 Py_DECREF(unicode);
7550 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007551}
7552
7553PyObject *
7554PyUnicode_EncodeCodePage(int code_page,
7555 PyObject *unicode,
7556 const char *errors)
7557{
Victor Stinner7581cef2011-11-03 22:32:33 +01007558 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007559}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007560
Alexander Belopolsky40018472011-02-26 01:02:56 +00007561PyObject *
7562PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007563{
7564 if (!PyUnicode_Check(unicode)) {
7565 PyErr_BadArgument();
7566 return NULL;
7567 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007568 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007569}
7570
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007571#undef NEED_RETRY
7572
Victor Stinner99b95382011-07-04 14:23:54 +02007573#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007574
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575/* --- Character Mapping Codec -------------------------------------------- */
7576
Victor Stinnerfb161b12013-04-18 01:44:27 +02007577static int
7578charmap_decode_string(const char *s,
7579 Py_ssize_t size,
7580 PyObject *mapping,
7581 const char *errors,
7582 _PyUnicodeWriter *writer)
7583{
7584 const char *starts = s;
7585 const char *e;
7586 Py_ssize_t startinpos, endinpos;
7587 PyObject *errorHandler = NULL, *exc = NULL;
7588 Py_ssize_t maplen;
7589 enum PyUnicode_Kind mapkind;
7590 void *mapdata;
7591 Py_UCS4 x;
7592 unsigned char ch;
7593
7594 if (PyUnicode_READY(mapping) == -1)
7595 return -1;
7596
7597 maplen = PyUnicode_GET_LENGTH(mapping);
7598 mapdata = PyUnicode_DATA(mapping);
7599 mapkind = PyUnicode_KIND(mapping);
7600
7601 e = s + size;
7602
7603 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7604 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7605 * is disabled in encoding aliases, latin1 is preferred because
7606 * its implementation is faster. */
7607 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7608 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7609 Py_UCS4 maxchar = writer->maxchar;
7610
7611 assert (writer->kind == PyUnicode_1BYTE_KIND);
7612 while (s < e) {
7613 ch = *s;
7614 x = mapdata_ucs1[ch];
7615 if (x > maxchar) {
7616 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7617 goto onError;
7618 maxchar = writer->maxchar;
7619 outdata = (Py_UCS1 *)writer->data;
7620 }
7621 outdata[writer->pos] = x;
7622 writer->pos++;
7623 ++s;
7624 }
7625 return 0;
7626 }
7627
7628 while (s < e) {
7629 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7630 enum PyUnicode_Kind outkind = writer->kind;
7631 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7632 if (outkind == PyUnicode_1BYTE_KIND) {
7633 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7634 Py_UCS4 maxchar = writer->maxchar;
7635 while (s < e) {
7636 ch = *s;
7637 x = mapdata_ucs2[ch];
7638 if (x > maxchar)
7639 goto Error;
7640 outdata[writer->pos] = x;
7641 writer->pos++;
7642 ++s;
7643 }
7644 break;
7645 }
7646 else if (outkind == PyUnicode_2BYTE_KIND) {
7647 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7648 while (s < e) {
7649 ch = *s;
7650 x = mapdata_ucs2[ch];
7651 if (x == 0xFFFE)
7652 goto Error;
7653 outdata[writer->pos] = x;
7654 writer->pos++;
7655 ++s;
7656 }
7657 break;
7658 }
7659 }
7660 ch = *s;
7661
7662 if (ch < maplen)
7663 x = PyUnicode_READ(mapkind, mapdata, ch);
7664 else
7665 x = 0xfffe; /* invalid value */
7666Error:
7667 if (x == 0xfffe)
7668 {
7669 /* undefined mapping */
7670 startinpos = s-starts;
7671 endinpos = startinpos+1;
7672 if (unicode_decode_call_errorhandler_writer(
7673 errors, &errorHandler,
7674 "charmap", "character maps to <undefined>",
7675 &starts, &e, &startinpos, &endinpos, &exc, &s,
7676 writer)) {
7677 goto onError;
7678 }
7679 continue;
7680 }
7681
7682 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7683 goto onError;
7684 ++s;
7685 }
7686 Py_XDECREF(errorHandler);
7687 Py_XDECREF(exc);
7688 return 0;
7689
7690onError:
7691 Py_XDECREF(errorHandler);
7692 Py_XDECREF(exc);
7693 return -1;
7694}
7695
7696static int
7697charmap_decode_mapping(const char *s,
7698 Py_ssize_t size,
7699 PyObject *mapping,
7700 const char *errors,
7701 _PyUnicodeWriter *writer)
7702{
7703 const char *starts = s;
7704 const char *e;
7705 Py_ssize_t startinpos, endinpos;
7706 PyObject *errorHandler = NULL, *exc = NULL;
7707 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007708 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007709
7710 e = s + size;
7711
7712 while (s < e) {
7713 ch = *s;
7714
7715 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7716 key = PyLong_FromLong((long)ch);
7717 if (key == NULL)
7718 goto onError;
7719
7720 item = PyObject_GetItem(mapping, key);
7721 Py_DECREF(key);
7722 if (item == NULL) {
7723 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7724 /* No mapping found means: mapping is undefined. */
7725 PyErr_Clear();
7726 goto Undefined;
7727 } else
7728 goto onError;
7729 }
7730
7731 /* Apply mapping */
7732 if (item == Py_None)
7733 goto Undefined;
7734 if (PyLong_Check(item)) {
7735 long value = PyLong_AS_LONG(item);
7736 if (value == 0xFFFE)
7737 goto Undefined;
7738 if (value < 0 || value > MAX_UNICODE) {
7739 PyErr_Format(PyExc_TypeError,
7740 "character mapping must be in range(0x%lx)",
7741 (unsigned long)MAX_UNICODE + 1);
7742 goto onError;
7743 }
7744
7745 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7746 goto onError;
7747 }
7748 else if (PyUnicode_Check(item)) {
7749 if (PyUnicode_READY(item) == -1)
7750 goto onError;
7751 if (PyUnicode_GET_LENGTH(item) == 1) {
7752 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7753 if (value == 0xFFFE)
7754 goto Undefined;
7755 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7756 goto onError;
7757 }
7758 else {
7759 writer->overallocate = 1;
7760 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7761 goto onError;
7762 }
7763 }
7764 else {
7765 /* wrong return value */
7766 PyErr_SetString(PyExc_TypeError,
7767 "character mapping must return integer, None or str");
7768 goto onError;
7769 }
7770 Py_CLEAR(item);
7771 ++s;
7772 continue;
7773
7774Undefined:
7775 /* undefined mapping */
7776 Py_CLEAR(item);
7777 startinpos = s-starts;
7778 endinpos = startinpos+1;
7779 if (unicode_decode_call_errorhandler_writer(
7780 errors, &errorHandler,
7781 "charmap", "character maps to <undefined>",
7782 &starts, &e, &startinpos, &endinpos, &exc, &s,
7783 writer)) {
7784 goto onError;
7785 }
7786 }
7787 Py_XDECREF(errorHandler);
7788 Py_XDECREF(exc);
7789 return 0;
7790
7791onError:
7792 Py_XDECREF(item);
7793 Py_XDECREF(errorHandler);
7794 Py_XDECREF(exc);
7795 return -1;
7796}
7797
Alexander Belopolsky40018472011-02-26 01:02:56 +00007798PyObject *
7799PyUnicode_DecodeCharmap(const char *s,
7800 Py_ssize_t size,
7801 PyObject *mapping,
7802 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007804 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007805
Guido van Rossumd57fd912000-03-10 22:53:23 +00007806 /* Default to Latin-1 */
7807 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007808 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007811 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007812 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007813 writer.min_length = size;
7814 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007816
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007817 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007818 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7819 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007820 }
7821 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007822 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7823 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007825 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007826
Benjamin Peterson29060642009-01-31 22:14:21 +00007827 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007828 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829 return NULL;
7830}
7831
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007832/* Charmap encoding: the lookup table */
7833
Alexander Belopolsky40018472011-02-26 01:02:56 +00007834struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007835 PyObject_HEAD
7836 unsigned char level1[32];
7837 int count2, count3;
7838 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007839};
7840
7841static PyObject*
7842encoding_map_size(PyObject *obj, PyObject* args)
7843{
7844 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007845 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007846 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007847}
7848
7849static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007850 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007851 PyDoc_STR("Return the size (in bytes) of this object") },
7852 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007853};
7854
7855static void
7856encoding_map_dealloc(PyObject* o)
7857{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007858 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007859}
7860
7861static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007862 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007863 "EncodingMap", /*tp_name*/
7864 sizeof(struct encoding_map), /*tp_basicsize*/
7865 0, /*tp_itemsize*/
7866 /* methods */
7867 encoding_map_dealloc, /*tp_dealloc*/
7868 0, /*tp_print*/
7869 0, /*tp_getattr*/
7870 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007871 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007872 0, /*tp_repr*/
7873 0, /*tp_as_number*/
7874 0, /*tp_as_sequence*/
7875 0, /*tp_as_mapping*/
7876 0, /*tp_hash*/
7877 0, /*tp_call*/
7878 0, /*tp_str*/
7879 0, /*tp_getattro*/
7880 0, /*tp_setattro*/
7881 0, /*tp_as_buffer*/
7882 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7883 0, /*tp_doc*/
7884 0, /*tp_traverse*/
7885 0, /*tp_clear*/
7886 0, /*tp_richcompare*/
7887 0, /*tp_weaklistoffset*/
7888 0, /*tp_iter*/
7889 0, /*tp_iternext*/
7890 encoding_map_methods, /*tp_methods*/
7891 0, /*tp_members*/
7892 0, /*tp_getset*/
7893 0, /*tp_base*/
7894 0, /*tp_dict*/
7895 0, /*tp_descr_get*/
7896 0, /*tp_descr_set*/
7897 0, /*tp_dictoffset*/
7898 0, /*tp_init*/
7899 0, /*tp_alloc*/
7900 0, /*tp_new*/
7901 0, /*tp_free*/
7902 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007903};
7904
7905PyObject*
7906PyUnicode_BuildEncodingMap(PyObject* string)
7907{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007908 PyObject *result;
7909 struct encoding_map *mresult;
7910 int i;
7911 int need_dict = 0;
7912 unsigned char level1[32];
7913 unsigned char level2[512];
7914 unsigned char *mlevel1, *mlevel2, *mlevel3;
7915 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007916 int kind;
7917 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007918 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007919 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007920
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007921 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007922 PyErr_BadArgument();
7923 return NULL;
7924 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007925 kind = PyUnicode_KIND(string);
7926 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007927 length = PyUnicode_GET_LENGTH(string);
7928 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007929 memset(level1, 0xFF, sizeof level1);
7930 memset(level2, 0xFF, sizeof level2);
7931
7932 /* If there isn't a one-to-one mapping of NULL to \0,
7933 or if there are non-BMP characters, we need to use
7934 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007935 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007936 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007937 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007938 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007939 ch = PyUnicode_READ(kind, data, i);
7940 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007941 need_dict = 1;
7942 break;
7943 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007944 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007945 /* unmapped character */
7946 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007947 l1 = ch >> 11;
7948 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007949 if (level1[l1] == 0xFF)
7950 level1[l1] = count2++;
7951 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007952 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007953 }
7954
7955 if (count2 >= 0xFF || count3 >= 0xFF)
7956 need_dict = 1;
7957
7958 if (need_dict) {
7959 PyObject *result = PyDict_New();
7960 PyObject *key, *value;
7961 if (!result)
7962 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007963 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007964 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007965 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007966 if (!key || !value)
7967 goto failed1;
7968 if (PyDict_SetItem(result, key, value) == -1)
7969 goto failed1;
7970 Py_DECREF(key);
7971 Py_DECREF(value);
7972 }
7973 return result;
7974 failed1:
7975 Py_XDECREF(key);
7976 Py_XDECREF(value);
7977 Py_DECREF(result);
7978 return NULL;
7979 }
7980
7981 /* Create a three-level trie */
7982 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7983 16*count2 + 128*count3 - 1);
7984 if (!result)
7985 return PyErr_NoMemory();
7986 PyObject_Init(result, &EncodingMapType);
7987 mresult = (struct encoding_map*)result;
7988 mresult->count2 = count2;
7989 mresult->count3 = count3;
7990 mlevel1 = mresult->level1;
7991 mlevel2 = mresult->level23;
7992 mlevel3 = mresult->level23 + 16*count2;
7993 memcpy(mlevel1, level1, 32);
7994 memset(mlevel2, 0xFF, 16*count2);
7995 memset(mlevel3, 0, 128*count3);
7996 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007997 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007998 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007999 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8000 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008001 /* unmapped character */
8002 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008003 o1 = ch>>11;
8004 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008005 i2 = 16*mlevel1[o1] + o2;
8006 if (mlevel2[i2] == 0xFF)
8007 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008008 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008009 i3 = 128*mlevel2[i2] + o3;
8010 mlevel3[i3] = i;
8011 }
8012 return result;
8013}
8014
8015static int
Victor Stinner22168992011-11-20 17:09:18 +01008016encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008017{
8018 struct encoding_map *map = (struct encoding_map*)mapping;
8019 int l1 = c>>11;
8020 int l2 = (c>>7) & 0xF;
8021 int l3 = c & 0x7F;
8022 int i;
8023
Victor Stinner22168992011-11-20 17:09:18 +01008024 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008026 if (c == 0)
8027 return 0;
8028 /* level 1*/
8029 i = map->level1[l1];
8030 if (i == 0xFF) {
8031 return -1;
8032 }
8033 /* level 2*/
8034 i = map->level23[16*i+l2];
8035 if (i == 0xFF) {
8036 return -1;
8037 }
8038 /* level 3 */
8039 i = map->level23[16*map->count2 + 128*i + l3];
8040 if (i == 0) {
8041 return -1;
8042 }
8043 return i;
8044}
8045
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008046/* Lookup the character ch in the mapping. If the character
8047 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008048 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008049static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008050charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051{
Christian Heimes217cfd12007-12-02 14:31:20 +00008052 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008053 PyObject *x;
8054
8055 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008056 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008057 x = PyObject_GetItem(mapping, w);
8058 Py_DECREF(w);
8059 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8061 /* No mapping found means: mapping is undefined. */
8062 PyErr_Clear();
8063 x = Py_None;
8064 Py_INCREF(x);
8065 return x;
8066 } else
8067 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008068 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008069 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008070 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008071 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008072 long value = PyLong_AS_LONG(x);
8073 if (value < 0 || value > 255) {
8074 PyErr_SetString(PyExc_TypeError,
8075 "character mapping must be in range(256)");
8076 Py_DECREF(x);
8077 return NULL;
8078 }
8079 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008081 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008082 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008084 /* wrong return value */
8085 PyErr_Format(PyExc_TypeError,
8086 "character mapping must return integer, bytes or None, not %.400s",
8087 x->ob_type->tp_name);
8088 Py_DECREF(x);
8089 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090 }
8091}
8092
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008093static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008094charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008095{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008096 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8097 /* exponentially overallocate to minimize reallocations */
8098 if (requiredsize < 2*outsize)
8099 requiredsize = 2*outsize;
8100 if (_PyBytes_Resize(outobj, requiredsize))
8101 return -1;
8102 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008103}
8104
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the character has no mapping; nothing written */
    enc_EXCEPTION       /* an error occurred while encoding */
} charmapencode_result;

Walter Dörwald3aeb6322002-09-02 13:14:32 +00008108/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008109 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008110 space is available. Return a new reference to the object that
8111 was put in the output buffer, or Py_None, if the mapping was undefined
8112 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008113 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008114static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008115charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008116 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008117{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008118 PyObject *rep;
8119 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008120 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008121
Christian Heimes90aa7642007-12-19 02:45:37 +00008122 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008123 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008124 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008125 if (res == -1)
8126 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 if (outsize<requiredsize)
8128 if (charmapencode_resize(outobj, outpos, requiredsize))
8129 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008130 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 outstart[(*outpos)++] = (char)res;
8132 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008133 }
8134
8135 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008136 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008138 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008139 Py_DECREF(rep);
8140 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008141 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008142 if (PyLong_Check(rep)) {
8143 Py_ssize_t requiredsize = *outpos+1;
8144 if (outsize<requiredsize)
8145 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8146 Py_DECREF(rep);
8147 return enc_EXCEPTION;
8148 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008149 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008151 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008152 else {
8153 const char *repchars = PyBytes_AS_STRING(rep);
8154 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8155 Py_ssize_t requiredsize = *outpos+repsize;
8156 if (outsize<requiredsize)
8157 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8158 Py_DECREF(rep);
8159 return enc_EXCEPTION;
8160 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008161 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008162 memcpy(outstart + *outpos, repchars, repsize);
8163 *outpos += repsize;
8164 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008165 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008166 Py_DECREF(rep);
8167 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008168}
8169
8170/* handle an error in PyUnicode_EncodeCharmap
8171 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008172static int
8173charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008174 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008175 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008176 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008177 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008178{
8179 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008180 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008181 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008182 enum PyUnicode_Kind kind;
8183 void *data;
8184 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008185 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008186 Py_ssize_t collstartpos = *inpos;
8187 Py_ssize_t collendpos = *inpos+1;
8188 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008189 char *encoding = "charmap";
8190 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008191 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008192 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008193 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008194
Benjamin Petersonbac79492012-01-14 13:34:47 -05008195 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008196 return -1;
8197 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008198 /* find all unencodable characters */
8199 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008200 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008201 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008202 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008203 val = encoding_map_lookup(ch, mapping);
8204 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008205 break;
8206 ++collendpos;
8207 continue;
8208 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008209
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008210 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8211 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008212 if (rep==NULL)
8213 return -1;
8214 else if (rep!=Py_None) {
8215 Py_DECREF(rep);
8216 break;
8217 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008218 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008220 }
8221 /* cache callback name lookup
8222 * (if not done yet, i.e. it's the first error) */
8223 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008224 if ((errors==NULL) || (!strcmp(errors, "strict")))
8225 *known_errorHandler = 1;
8226 else if (!strcmp(errors, "replace"))
8227 *known_errorHandler = 2;
8228 else if (!strcmp(errors, "ignore"))
8229 *known_errorHandler = 3;
8230 else if (!strcmp(errors, "xmlcharrefreplace"))
8231 *known_errorHandler = 4;
8232 else
8233 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008234 }
8235 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008236 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008237 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008238 return -1;
8239 case 2: /* replace */
8240 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 x = charmapencode_output('?', mapping, res, respos);
8242 if (x==enc_EXCEPTION) {
8243 return -1;
8244 }
8245 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008246 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008247 return -1;
8248 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008249 }
8250 /* fall through */
8251 case 3: /* ignore */
8252 *inpos = collendpos;
8253 break;
8254 case 4: /* xmlcharrefreplace */
8255 /* generate replacement (temporarily (mis)uses p) */
8256 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 char buffer[2+29+1+1];
8258 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008259 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008260 for (cp = buffer; *cp; ++cp) {
8261 x = charmapencode_output(*cp, mapping, res, respos);
8262 if (x==enc_EXCEPTION)
8263 return -1;
8264 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008265 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 return -1;
8267 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008268 }
8269 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008270 *inpos = collendpos;
8271 break;
8272 default:
8273 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008274 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008276 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008278 if (PyBytes_Check(repunicode)) {
8279 /* Directly copy bytes result to output. */
8280 Py_ssize_t outsize = PyBytes_Size(*res);
8281 Py_ssize_t requiredsize;
8282 repsize = PyBytes_Size(repunicode);
8283 requiredsize = *respos + repsize;
8284 if (requiredsize > outsize)
8285 /* Make room for all additional bytes. */
8286 if (charmapencode_resize(res, respos, requiredsize)) {
8287 Py_DECREF(repunicode);
8288 return -1;
8289 }
8290 memcpy(PyBytes_AsString(*res) + *respos,
8291 PyBytes_AsString(repunicode), repsize);
8292 *respos += repsize;
8293 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008294 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008295 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008296 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008297 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008298 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008299 Py_DECREF(repunicode);
8300 return -1;
8301 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008302 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008303 data = PyUnicode_DATA(repunicode);
8304 kind = PyUnicode_KIND(repunicode);
8305 for (index = 0; index < repsize; index++) {
8306 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8307 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008309 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008310 return -1;
8311 }
8312 else if (x==enc_FAILED) {
8313 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008314 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 return -1;
8316 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008317 }
8318 *inpos = newpos;
8319 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008320 }
8321 return 0;
8322}
8323
Alexander Belopolsky40018472011-02-26 01:02:56 +00008324PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008325_PyUnicode_EncodeCharmap(PyObject *unicode,
8326 PyObject *mapping,
8327 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008329 /* output object */
8330 PyObject *res = NULL;
8331 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008332 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008333 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008334 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008335 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008336 PyObject *errorHandler = NULL;
8337 PyObject *exc = NULL;
8338 /* the following variable is used for caching string comparisons
8339 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8340 * 3=ignore, 4=xmlcharrefreplace */
8341 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008342 void *data;
8343 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344
Benjamin Petersonbac79492012-01-14 13:34:47 -05008345 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008346 return NULL;
8347 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008348 data = PyUnicode_DATA(unicode);
8349 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008350
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351 /* Default to Latin-1 */
8352 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008353 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008355 /* allocate enough for a simple encoding without
8356 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008357 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008358 if (res == NULL)
8359 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008360 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008363 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008364 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008366 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 if (x==enc_EXCEPTION) /* error */
8368 goto onError;
8369 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008370 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 &exc,
8372 &known_errorHandler, &errorHandler, errors,
8373 &res, &respos)) {
8374 goto onError;
8375 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008376 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 else
8378 /* done with this character => adjust input position */
8379 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008382 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008383 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008384 if (_PyBytes_Resize(&res, respos) < 0)
8385 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008386
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387 Py_XDECREF(exc);
8388 Py_XDECREF(errorHandler);
8389 return res;
8390
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 Py_XDECREF(res);
8393 Py_XDECREF(exc);
8394 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395 return NULL;
8396}
8397
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008398/* Deprecated */
8399PyObject *
8400PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8401 Py_ssize_t size,
8402 PyObject *mapping,
8403 const char *errors)
8404{
8405 PyObject *result;
8406 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8407 if (unicode == NULL)
8408 return NULL;
8409 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8410 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008411 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008412}
8413
Alexander Belopolsky40018472011-02-26 01:02:56 +00008414PyObject *
8415PyUnicode_AsCharmapString(PyObject *unicode,
8416 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008417{
8418 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008419 PyErr_BadArgument();
8420 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008421 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008422 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423}
8424
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008425/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008426static void
8427make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008428 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008429 Py_ssize_t startpos, Py_ssize_t endpos,
8430 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008432 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008433 *exceptionObject = _PyUnicodeTranslateError_Create(
8434 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008435 }
8436 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8438 goto onError;
8439 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8440 goto onError;
8441 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8442 goto onError;
8443 return;
8444 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008445 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446 }
8447}
8448
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008449/* error handling callback helper:
8450 build arguments, call the callback and check the arguments,
8451 put the result into newpos and return the replacement string, which
8452 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008453static PyObject *
8454unicode_translate_call_errorhandler(const char *errors,
8455 PyObject **errorHandler,
8456 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008457 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008458 Py_ssize_t startpos, Py_ssize_t endpos,
8459 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008460{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008461 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008462
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008463 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008464 PyObject *restuple;
8465 PyObject *resunicode;
8466
8467 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008469 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008471 }
8472
8473 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008474 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008475 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008477
8478 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008480 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008481 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008482 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008483 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008484 Py_DECREF(restuple);
8485 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008486 }
8487 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 &resunicode, &i_newpos)) {
8489 Py_DECREF(restuple);
8490 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008491 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008492 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008493 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008494 else
8495 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008497 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 Py_DECREF(restuple);
8499 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008500 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008501 Py_INCREF(resunicode);
8502 Py_DECREF(restuple);
8503 return resunicode;
8504}
8505
8506/* Lookup the character ch in the mapping and put the result in result,
8507 which must be decrefed by the caller.
8508 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008509static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008510charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008511{
Christian Heimes217cfd12007-12-02 14:31:20 +00008512 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008513 PyObject *x;
8514
8515 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008517 x = PyObject_GetItem(mapping, w);
8518 Py_DECREF(w);
8519 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8521 /* No mapping found means: use 1:1 mapping. */
8522 PyErr_Clear();
8523 *result = NULL;
8524 return 0;
8525 } else
8526 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008527 }
8528 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 *result = x;
8530 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008531 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008532 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008533 long value = PyLong_AS_LONG(x);
8534 long max = PyUnicode_GetMax();
8535 if (value < 0 || value > max) {
8536 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008537 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008538 Py_DECREF(x);
8539 return -1;
8540 }
8541 *result = x;
8542 return 0;
8543 }
8544 else if (PyUnicode_Check(x)) {
8545 *result = x;
8546 return 0;
8547 }
8548 else {
8549 /* wrong return value */
8550 PyErr_SetString(PyExc_TypeError,
8551 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008552 Py_DECREF(x);
8553 return -1;
8554 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008555}
8556/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 if not reallocate and adjust various state variables.
8558 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008559static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008560charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008562{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008563 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008564 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008565 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 /* exponentially overallocate to minimize reallocations */
8567 if (requiredsize < 2 * oldsize)
8568 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008569 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8570 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008572 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008574 }
8575 return 0;
8576}
8577/* lookup the character, put the result in the output string and adjust
8578 various state variables. Return a new reference to the object that
8579 was put in the output buffer in *result, or Py_None, if the mapping was
8580 undefined (in which case no character was written).
8581 The called must decref result.
8582 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008583static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008584charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8585 PyObject *mapping, Py_UCS4 **output,
8586 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008587 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008588{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8590 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008593 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008594 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008595 }
8596 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008598 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008599 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008600 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008601 }
8602 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603 Py_ssize_t repsize;
8604 if (PyUnicode_READY(*res) == -1)
8605 return -1;
8606 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 if (repsize==1) {
8608 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008609 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 }
8611 else if (repsize!=0) {
8612 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008613 Py_ssize_t requiredsize = *opos +
8614 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008616 Py_ssize_t i;
8617 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008618 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 for(i = 0; i < repsize; i++)
8620 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008622 }
8623 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008625 return 0;
8626}
8627
Alexander Belopolsky40018472011-02-26 01:02:56 +00008628PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008629_PyUnicode_TranslateCharmap(PyObject *input,
8630 PyObject *mapping,
8631 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633 /* input object */
8634 char *idata;
8635 Py_ssize_t size, i;
8636 int kind;
8637 /* output buffer */
8638 Py_UCS4 *output = NULL;
8639 Py_ssize_t osize;
8640 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008641 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008642 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008643 char *reason = "character maps to <undefined>";
8644 PyObject *errorHandler = NULL;
8645 PyObject *exc = NULL;
8646 /* the following variable is used for caching string comparisons
8647 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8648 * 3=ignore, 4=xmlcharrefreplace */
8649 int known_errorHandler = -1;
8650
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008652 PyErr_BadArgument();
8653 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008656 if (PyUnicode_READY(input) == -1)
8657 return NULL;
8658 idata = (char*)PyUnicode_DATA(input);
8659 kind = PyUnicode_KIND(input);
8660 size = PyUnicode_GET_LENGTH(input);
8661 i = 0;
8662
8663 if (size == 0) {
8664 Py_INCREF(input);
8665 return input;
8666 }
8667
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008668 /* allocate enough for a simple 1:1 translation without
8669 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008670 osize = size;
8671 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8672 opos = 0;
8673 if (output == NULL) {
8674 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008678 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 /* try to encode it */
8680 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008681 if (charmaptranslate_output(input, i, mapping,
8682 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 Py_XDECREF(x);
8684 goto onError;
8685 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008686 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008688 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 else { /* untranslatable character */
8690 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8691 Py_ssize_t repsize;
8692 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008693 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008695 Py_ssize_t collstart = i;
8696 Py_ssize_t collend = i+1;
8697 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008700 while (collend < size) {
8701 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 goto onError;
8703 Py_XDECREF(x);
8704 if (x!=Py_None)
8705 break;
8706 ++collend;
8707 }
8708 /* cache callback name lookup
8709 * (if not done yet, i.e. it's the first error) */
8710 if (known_errorHandler==-1) {
8711 if ((errors==NULL) || (!strcmp(errors, "strict")))
8712 known_errorHandler = 1;
8713 else if (!strcmp(errors, "replace"))
8714 known_errorHandler = 2;
8715 else if (!strcmp(errors, "ignore"))
8716 known_errorHandler = 3;
8717 else if (!strcmp(errors, "xmlcharrefreplace"))
8718 known_errorHandler = 4;
8719 else
8720 known_errorHandler = 0;
8721 }
8722 switch (known_errorHandler) {
8723 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008724 make_translate_exception(&exc,
8725 input, collstart, collend, reason);
8726 if (exc != NULL)
8727 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008728 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 case 2: /* replace */
8730 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008731 for (coll = collstart; coll<collend; coll++)
8732 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 /* fall through */
8734 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008735 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 break;
8737 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008738 /* generate replacement (temporarily (mis)uses i) */
8739 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 char buffer[2+29+1+1];
8741 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008742 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8743 if (charmaptranslate_makespace(&output, &osize,
8744 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008745 goto onError;
8746 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008747 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008748 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008749 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008750 break;
8751 default:
8752 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008753 reason, input, &exc,
8754 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008755 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008756 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008757 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008758 Py_DECREF(repunicode);
8759 goto onError;
8760 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008762 repsize = PyUnicode_GET_LENGTH(repunicode);
8763 if (charmaptranslate_makespace(&output, &osize,
8764 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 Py_DECREF(repunicode);
8766 goto onError;
8767 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008768 for (uni2 = 0; repsize-->0; ++uni2)
8769 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8770 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008772 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008773 }
8774 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008775 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8776 if (!res)
8777 goto onError;
8778 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008779 Py_XDECREF(exc);
8780 Py_XDECREF(errorHandler);
8781 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782
Benjamin Peterson29060642009-01-31 22:14:21 +00008783 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008784 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008785 Py_XDECREF(exc);
8786 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008787 return NULL;
8788}
8789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008790/* Deprecated. Use PyUnicode_Translate instead. */
8791PyObject *
8792PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8793 Py_ssize_t size,
8794 PyObject *mapping,
8795 const char *errors)
8796{
Christian Heimes5f520f42012-09-11 14:03:25 +02008797 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008798 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8799 if (!unicode)
8800 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008801 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8802 Py_DECREF(unicode);
8803 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804}
8805
Alexander Belopolsky40018472011-02-26 01:02:56 +00008806PyObject *
8807PyUnicode_Translate(PyObject *str,
8808 PyObject *mapping,
8809 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008810{
8811 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008812
Guido van Rossumd57fd912000-03-10 22:53:23 +00008813 str = PyUnicode_FromObject(str);
8814 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008815 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008816 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008817 Py_DECREF(str);
8818 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008819}
Tim Petersced69f82003-09-16 20:30:58 +00008820
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008821static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008822fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008823{
8824 /* No need to call PyUnicode_READY(self) because this function is only
8825 called as a callback from fixup() which does it already. */
8826 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8827 const int kind = PyUnicode_KIND(self);
8828 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008829 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008830 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831 Py_ssize_t i;
8832
8833 for (i = 0; i < len; ++i) {
8834 ch = PyUnicode_READ(kind, data, i);
8835 fixed = 0;
8836 if (ch > 127) {
8837 if (Py_UNICODE_ISSPACE(ch))
8838 fixed = ' ';
8839 else {
8840 const int decimal = Py_UNICODE_TODECIMAL(ch);
8841 if (decimal >= 0)
8842 fixed = '0' + decimal;
8843 }
8844 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008845 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008846 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008847 PyUnicode_WRITE(kind, data, i, fixed);
8848 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008849 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008850 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852 }
8853
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008854 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855}
8856
8857PyObject *
8858_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8859{
8860 if (!PyUnicode_Check(unicode)) {
8861 PyErr_BadInternalCall();
8862 return NULL;
8863 }
8864 if (PyUnicode_READY(unicode) == -1)
8865 return NULL;
8866 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8867 /* If the string is already ASCII, just return the same string */
8868 Py_INCREF(unicode);
8869 return unicode;
8870 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008871 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008872}
8873
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008874PyObject *
8875PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8876 Py_ssize_t length)
8877{
Victor Stinnerf0124502011-11-21 23:12:56 +01008878 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008879 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008880 Py_UCS4 maxchar;
8881 enum PyUnicode_Kind kind;
8882 void *data;
8883
Victor Stinner99d7ad02012-02-22 13:37:39 +01008884 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008885 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008886 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008887 if (ch > 127) {
8888 int decimal = Py_UNICODE_TODECIMAL(ch);
8889 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008890 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008891 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008892 }
8893 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008894
8895 /* Copy to a new string */
8896 decimal = PyUnicode_New(length, maxchar);
8897 if (decimal == NULL)
8898 return decimal;
8899 kind = PyUnicode_KIND(decimal);
8900 data = PyUnicode_DATA(decimal);
8901 /* Iterate over code points */
8902 for (i = 0; i < length; i++) {
8903 Py_UNICODE ch = s[i];
8904 if (ch > 127) {
8905 int decimal = Py_UNICODE_TODECIMAL(ch);
8906 if (decimal >= 0)
8907 ch = '0' + decimal;
8908 }
8909 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008910 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008911 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008912}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008913/* --- Decimal Encoder ---------------------------------------------------- */
8914
Alexander Belopolsky40018472011-02-26 01:02:56 +00008915int
8916PyUnicode_EncodeDecimal(Py_UNICODE *s,
8917 Py_ssize_t length,
8918 char *output,
8919 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008920{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008921 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008922 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008923 enum PyUnicode_Kind kind;
8924 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008925
8926 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008927 PyErr_BadArgument();
8928 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008929 }
8930
Victor Stinner42bf7752011-11-21 22:52:58 +01008931 unicode = PyUnicode_FromUnicode(s, length);
8932 if (unicode == NULL)
8933 return -1;
8934
Benjamin Petersonbac79492012-01-14 13:34:47 -05008935 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008936 Py_DECREF(unicode);
8937 return -1;
8938 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008939 kind = PyUnicode_KIND(unicode);
8940 data = PyUnicode_DATA(unicode);
8941
Victor Stinnerb84d7232011-11-22 01:50:07 +01008942 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008943 PyObject *exc;
8944 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008945 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008946 Py_ssize_t startpos;
8947
8948 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008949
Benjamin Peterson29060642009-01-31 22:14:21 +00008950 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008951 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008952 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008953 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008954 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008955 decimal = Py_UNICODE_TODECIMAL(ch);
8956 if (decimal >= 0) {
8957 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008958 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008959 continue;
8960 }
8961 if (0 < ch && ch < 256) {
8962 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008963 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008964 continue;
8965 }
Victor Stinner6345be92011-11-25 20:09:01 +01008966
Victor Stinner42bf7752011-11-21 22:52:58 +01008967 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008968 exc = NULL;
8969 raise_encode_exception(&exc, "decimal", unicode,
8970 startpos, startpos+1,
8971 "invalid decimal Unicode string");
8972 Py_XDECREF(exc);
8973 Py_DECREF(unicode);
8974 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008975 }
8976 /* 0-terminate the output string */
8977 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008978 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008979 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008980}
8981
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982/* --- Helpers ------------------------------------------------------------ */
8983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008984static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008985any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008986 Py_ssize_t start,
8987 Py_ssize_t end)
8988{
8989 int kind1, kind2, kind;
8990 void *buf1, *buf2;
8991 Py_ssize_t len1, len2, result;
8992
8993 kind1 = PyUnicode_KIND(s1);
8994 kind2 = PyUnicode_KIND(s2);
8995 kind = kind1 > kind2 ? kind1 : kind2;
8996 buf1 = PyUnicode_DATA(s1);
8997 buf2 = PyUnicode_DATA(s2);
8998 if (kind1 != kind)
8999 buf1 = _PyUnicode_AsKind(s1, kind);
9000 if (!buf1)
9001 return -2;
9002 if (kind2 != kind)
9003 buf2 = _PyUnicode_AsKind(s2, kind);
9004 if (!buf2) {
9005 if (kind1 != kind) PyMem_Free(buf1);
9006 return -2;
9007 }
9008 len1 = PyUnicode_GET_LENGTH(s1);
9009 len2 = PyUnicode_GET_LENGTH(s2);
9010
Victor Stinner794d5672011-10-10 03:21:36 +02009011 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009012 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009013 case PyUnicode_1BYTE_KIND:
9014 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9015 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9016 else
9017 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9018 break;
9019 case PyUnicode_2BYTE_KIND:
9020 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9021 break;
9022 case PyUnicode_4BYTE_KIND:
9023 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9024 break;
9025 default:
9026 assert(0); result = -2;
9027 }
9028 }
9029 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009030 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009031 case PyUnicode_1BYTE_KIND:
9032 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9033 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9034 else
9035 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9036 break;
9037 case PyUnicode_2BYTE_KIND:
9038 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9039 break;
9040 case PyUnicode_4BYTE_KIND:
9041 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9042 break;
9043 default:
9044 assert(0); result = -2;
9045 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009046 }
9047
9048 if (kind1 != kind)
9049 PyMem_Free(buf1);
9050 if (kind2 != kind)
9051 PyMem_Free(buf2);
9052
9053 return result;
9054}
9055
9056Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009057_PyUnicode_InsertThousandsGrouping(
9058 PyObject *unicode, Py_ssize_t index,
9059 Py_ssize_t n_buffer,
9060 void *digits, Py_ssize_t n_digits,
9061 Py_ssize_t min_width,
9062 const char *grouping, PyObject *thousands_sep,
9063 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064{
Victor Stinner41a863c2012-02-24 00:37:51 +01009065 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009066 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009067 Py_ssize_t thousands_sep_len;
9068 Py_ssize_t len;
9069
9070 if (unicode != NULL) {
9071 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009072 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009073 }
9074 else {
9075 kind = PyUnicode_1BYTE_KIND;
9076 data = NULL;
9077 }
9078 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9079 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9080 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9081 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009082 if (thousands_sep_kind < kind) {
9083 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9084 if (!thousands_sep_data)
9085 return -1;
9086 }
9087 else {
9088 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9089 if (!data)
9090 return -1;
9091 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009092 }
9093
Benjamin Petersonead6b532011-12-20 17:23:42 -06009094 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009096 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009097 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009098 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009099 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009100 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009101 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009102 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009103 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009104 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009105 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009106 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009107 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009108 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009109 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009110 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009111 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009112 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009113 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009114 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009115 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009116 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009117 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009118 break;
9119 default:
9120 assert(0);
9121 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009122 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009123 if (unicode != NULL && thousands_sep_kind != kind) {
9124 if (thousands_sep_kind < kind)
9125 PyMem_Free(thousands_sep_data);
9126 else
9127 PyMem_Free(data);
9128 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009129 if (unicode == NULL) {
9130 *maxchar = 127;
9131 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009132 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009133 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009134 }
9135 }
9136 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009137}
9138
9139
/* Helper macro to fix up start/end slice values against a sequence of
   length `len`:
     - `end` greater than `len` is clamped to `len`;
     - a negative `end` or `start` is taken relative to `len` and, if still
       negative, set to 0.
   `start` is not clamped against `len`.

   `start` and `end` must be modifiable lvalues.  Every argument may be
   evaluated more than once, so pass side-effect-free expressions only.
   The body is wrapped in do { ... } while (0) so that the expansion is a
   single statement (safe inside an unbraced if/else); invoke it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009154
Alexander Belopolsky40018472011-02-26 01:02:56 +00009155Py_ssize_t
9156PyUnicode_Count(PyObject *str,
9157 PyObject *substr,
9158 Py_ssize_t start,
9159 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009160{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009161 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009162 PyObject* str_obj;
9163 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009164 int kind1, kind2, kind;
9165 void *buf1 = NULL, *buf2 = NULL;
9166 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009167
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009168 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009169 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009170 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009171 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009172 if (!sub_obj) {
9173 Py_DECREF(str_obj);
9174 return -1;
9175 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009176 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009177 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009178 Py_DECREF(str_obj);
9179 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009180 }
Tim Petersced69f82003-09-16 20:30:58 +00009181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009182 kind1 = PyUnicode_KIND(str_obj);
9183 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009184 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009185 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009186 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009187 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02009188 if (kind2 > kind) {
9189 Py_DECREF(sub_obj);
9190 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009191 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02009192 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01009193 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009194 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009195 if (!buf2)
9196 goto onError;
9197 len1 = PyUnicode_GET_LENGTH(str_obj);
9198 len2 = PyUnicode_GET_LENGTH(sub_obj);
9199
9200 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009201 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009202 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009203 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9204 result = asciilib_count(
9205 ((Py_UCS1*)buf1) + start, end - start,
9206 buf2, len2, PY_SSIZE_T_MAX
9207 );
9208 else
9209 result = ucs1lib_count(
9210 ((Py_UCS1*)buf1) + start, end - start,
9211 buf2, len2, PY_SSIZE_T_MAX
9212 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009213 break;
9214 case PyUnicode_2BYTE_KIND:
9215 result = ucs2lib_count(
9216 ((Py_UCS2*)buf1) + start, end - start,
9217 buf2, len2, PY_SSIZE_T_MAX
9218 );
9219 break;
9220 case PyUnicode_4BYTE_KIND:
9221 result = ucs4lib_count(
9222 ((Py_UCS4*)buf1) + start, end - start,
9223 buf2, len2, PY_SSIZE_T_MAX
9224 );
9225 break;
9226 default:
9227 assert(0); result = 0;
9228 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009229
9230 Py_DECREF(sub_obj);
9231 Py_DECREF(str_obj);
9232
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009233 if (kind2 != kind)
9234 PyMem_Free(buf2);
9235
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009237 onError:
9238 Py_DECREF(sub_obj);
9239 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009240 if (kind2 != kind && buf2)
9241 PyMem_Free(buf2);
9242 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009243}
9244
Alexander Belopolsky40018472011-02-26 01:02:56 +00009245Py_ssize_t
9246PyUnicode_Find(PyObject *str,
9247 PyObject *sub,
9248 Py_ssize_t start,
9249 Py_ssize_t end,
9250 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009251{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009252 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009253
Guido van Rossumd57fd912000-03-10 22:53:23 +00009254 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009255 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009256 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009257 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009258 if (!sub) {
9259 Py_DECREF(str);
9260 return -2;
9261 }
9262 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9263 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009264 Py_DECREF(str);
9265 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009266 }
Tim Petersced69f82003-09-16 20:30:58 +00009267
Victor Stinner794d5672011-10-10 03:21:36 +02009268 result = any_find_slice(direction,
9269 str, sub, start, end
9270 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009271
Guido van Rossumd57fd912000-03-10 22:53:23 +00009272 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009273 Py_DECREF(sub);
9274
Guido van Rossumd57fd912000-03-10 22:53:23 +00009275 return result;
9276}
9277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278Py_ssize_t
9279PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9280 Py_ssize_t start, Py_ssize_t end,
9281 int direction)
9282{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009284 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285 if (PyUnicode_READY(str) == -1)
9286 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009287 if (start < 0 || end < 0) {
9288 PyErr_SetString(PyExc_IndexError, "string index out of range");
9289 return -2;
9290 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009291 if (end > PyUnicode_GET_LENGTH(str))
9292 end = PyUnicode_GET_LENGTH(str);
9293 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009294 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9295 kind, end-start, ch, direction);
9296 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009297 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009298 else
9299 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009300}
9301
Alexander Belopolsky40018472011-02-26 01:02:56 +00009302static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009303tailmatch(PyObject *self,
9304 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009305 Py_ssize_t start,
9306 Py_ssize_t end,
9307 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009308{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009309 int kind_self;
9310 int kind_sub;
9311 void *data_self;
9312 void *data_sub;
9313 Py_ssize_t offset;
9314 Py_ssize_t i;
9315 Py_ssize_t end_sub;
9316
9317 if (PyUnicode_READY(self) == -1 ||
9318 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009319 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009320
9321 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322 return 1;
9323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009324 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9325 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009326 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009327 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329 kind_self = PyUnicode_KIND(self);
9330 data_self = PyUnicode_DATA(self);
9331 kind_sub = PyUnicode_KIND(substring);
9332 data_sub = PyUnicode_DATA(substring);
9333 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9334
9335 if (direction > 0)
9336 offset = end;
9337 else
9338 offset = start;
9339
9340 if (PyUnicode_READ(kind_self, data_self, offset) ==
9341 PyUnicode_READ(kind_sub, data_sub, 0) &&
9342 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9343 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9344 /* If both are of the same kind, memcmp is sufficient */
9345 if (kind_self == kind_sub) {
9346 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009347 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 data_sub,
9349 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009350 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009351 }
9352 /* otherwise we have to compare each character by first accesing it */
9353 else {
9354 /* We do not need to compare 0 and len(substring)-1 because
9355 the if statement above ensured already that they are equal
9356 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009357 for (i = 1; i < end_sub; ++i) {
9358 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9359 PyUnicode_READ(kind_sub, data_sub, i))
9360 return 0;
9361 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009362 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009364 }
9365
9366 return 0;
9367}
9368
Alexander Belopolsky40018472011-02-26 01:02:56 +00009369Py_ssize_t
9370PyUnicode_Tailmatch(PyObject *str,
9371 PyObject *substr,
9372 Py_ssize_t start,
9373 Py_ssize_t end,
9374 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009376 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009377
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378 str = PyUnicode_FromObject(str);
9379 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009380 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009381 substr = PyUnicode_FromObject(substr);
9382 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009383 Py_DECREF(str);
9384 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009385 }
Tim Petersced69f82003-09-16 20:30:58 +00009386
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009387 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009388 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009389 Py_DECREF(str);
9390 Py_DECREF(substr);
9391 return result;
9392}
9393
Guido van Rossumd57fd912000-03-10 22:53:23 +00009394/* Apply fixfct filter to the Unicode object self and return a
9395 reference to the modified object */
9396
Alexander Belopolsky40018472011-02-26 01:02:56 +00009397static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009398fixup(PyObject *self,
9399 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009400{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401 PyObject *u;
9402 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009403 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009404
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009405 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009406 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009407 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009408 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009409
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009410 /* fix functions return the new maximum character in a string,
9411 if the kind of the resulting unicode object does not change,
9412 everything is fine. Otherwise we need to change the string kind
9413 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009414 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009415
9416 if (maxchar_new == 0) {
9417 /* no changes */;
9418 if (PyUnicode_CheckExact(self)) {
9419 Py_DECREF(u);
9420 Py_INCREF(self);
9421 return self;
9422 }
9423 else
9424 return u;
9425 }
9426
Victor Stinnere6abb482012-05-02 01:15:40 +02009427 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009428
Victor Stinnereaab6042011-12-11 22:22:39 +01009429 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009431
9432 /* In case the maximum character changed, we need to
9433 convert the string to the new category. */
9434 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9435 if (v == NULL) {
9436 Py_DECREF(u);
9437 return NULL;
9438 }
9439 if (maxchar_new > maxchar_old) {
9440 /* If the maxchar increased so that the kind changed, not all
9441 characters are representable anymore and we need to fix the
9442 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009443 _PyUnicode_FastCopyCharacters(v, 0,
9444 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009445 maxchar_old = fixfct(v);
9446 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009447 }
9448 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009449 _PyUnicode_FastCopyCharacters(v, 0,
9450 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009452 Py_DECREF(u);
9453 assert(_PyUnicode_CheckConsistency(v, 1));
9454 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009455}
9456
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009457static PyObject *
9458ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009459{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009460 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9461 char *resdata, *data = PyUnicode_DATA(self);
9462 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009463
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009464 res = PyUnicode_New(len, 127);
9465 if (res == NULL)
9466 return NULL;
9467 resdata = PyUnicode_DATA(res);
9468 if (lower)
9469 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009471 _Py_bytes_upper(resdata, data, len);
9472 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009473}
9474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009476handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009477{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009478 Py_ssize_t j;
9479 int final_sigma;
9480 Py_UCS4 c;
9481 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009482
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009483 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9484
9485 where ! is a negation and \p{xxx} is a character with property xxx.
9486 */
9487 for (j = i - 1; j >= 0; j--) {
9488 c = PyUnicode_READ(kind, data, j);
9489 if (!_PyUnicode_IsCaseIgnorable(c))
9490 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009492 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9493 if (final_sigma) {
9494 for (j = i + 1; j < length; j++) {
9495 c = PyUnicode_READ(kind, data, j);
9496 if (!_PyUnicode_IsCaseIgnorable(c))
9497 break;
9498 }
9499 final_sigma = j == length || !_PyUnicode_IsCased(c);
9500 }
9501 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502}
9503
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009504static int
9505lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9506 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009508 /* Obscure special case. */
9509 if (c == 0x3A3) {
9510 mapped[0] = handle_capital_sigma(kind, data, length, i);
9511 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009513 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514}
9515
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009516static Py_ssize_t
9517do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009519 Py_ssize_t i, k = 0;
9520 int n_res, j;
9521 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009522
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009523 c = PyUnicode_READ(kind, data, 0);
9524 n_res = _PyUnicode_ToUpperFull(c, mapped);
9525 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009526 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009527 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009528 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009529 for (i = 1; i < length; i++) {
9530 c = PyUnicode_READ(kind, data, i);
9531 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9532 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009533 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009534 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009535 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009536 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009537 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009538}
9539
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009540static Py_ssize_t
9541do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9542 Py_ssize_t i, k = 0;
9543
9544 for (i = 0; i < length; i++) {
9545 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9546 int n_res, j;
9547 if (Py_UNICODE_ISUPPER(c)) {
9548 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9549 }
9550 else if (Py_UNICODE_ISLOWER(c)) {
9551 n_res = _PyUnicode_ToUpperFull(c, mapped);
9552 }
9553 else {
9554 n_res = 1;
9555 mapped[0] = c;
9556 }
9557 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009558 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009559 res[k++] = mapped[j];
9560 }
9561 }
9562 return k;
9563}
9564
9565static Py_ssize_t
9566do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9567 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009568{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009569 Py_ssize_t i, k = 0;
9570
9571 for (i = 0; i < length; i++) {
9572 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9573 int n_res, j;
9574 if (lower)
9575 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9576 else
9577 n_res = _PyUnicode_ToUpperFull(c, mapped);
9578 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009579 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009580 res[k++] = mapped[j];
9581 }
9582 }
9583 return k;
9584}
9585
9586static Py_ssize_t
9587do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9588{
9589 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9590}
9591
9592static Py_ssize_t
9593do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9594{
9595 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9596}
9597
Benjamin Petersone51757f2012-01-12 21:10:29 -05009598static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009599do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9600{
9601 Py_ssize_t i, k = 0;
9602
9603 for (i = 0; i < length; i++) {
9604 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9605 Py_UCS4 mapped[3];
9606 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9607 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009608 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009609 res[k++] = mapped[j];
9610 }
9611 }
9612 return k;
9613}
9614
9615static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009616do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9617{
9618 Py_ssize_t i, k = 0;
9619 int previous_is_cased;
9620
9621 previous_is_cased = 0;
9622 for (i = 0; i < length; i++) {
9623 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9624 Py_UCS4 mapped[3];
9625 int n_res, j;
9626
9627 if (previous_is_cased)
9628 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9629 else
9630 n_res = _PyUnicode_ToTitleFull(c, mapped);
9631
9632 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009633 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009634 res[k++] = mapped[j];
9635 }
9636
9637 previous_is_cased = _PyUnicode_IsCased(c);
9638 }
9639 return k;
9640}
9641
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009642static PyObject *
9643case_operation(PyObject *self,
9644 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9645{
9646 PyObject *res = NULL;
9647 Py_ssize_t length, newlength = 0;
9648 int kind, outkind;
9649 void *data, *outdata;
9650 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9651
Benjamin Petersoneea48462012-01-16 14:28:50 -05009652 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009653
9654 kind = PyUnicode_KIND(self);
9655 data = PyUnicode_DATA(self);
9656 length = PyUnicode_GET_LENGTH(self);
Antoine Pitroub6dc9b72014-10-15 23:14:53 +02009657 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009658 PyErr_SetString(PyExc_OverflowError, "string is too long");
9659 return NULL;
9660 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009661 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009662 if (tmp == NULL)
9663 return PyErr_NoMemory();
9664 newlength = perform(kind, data, length, tmp, &maxchar);
9665 res = PyUnicode_New(newlength, maxchar);
9666 if (res == NULL)
9667 goto leave;
9668 tmpend = tmp + newlength;
9669 outdata = PyUnicode_DATA(res);
9670 outkind = PyUnicode_KIND(res);
9671 switch (outkind) {
9672 case PyUnicode_1BYTE_KIND:
9673 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9674 break;
9675 case PyUnicode_2BYTE_KIND:
9676 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9677 break;
9678 case PyUnicode_4BYTE_KIND:
9679 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9680 break;
9681 default:
9682 assert(0);
9683 break;
9684 }
9685 leave:
9686 PyMem_FREE(tmp);
9687 return res;
9688}
9689
Tim Peters8ce9f162004-08-27 01:49:32 +00009690PyObject *
9691PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009692{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009693 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009694 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009695 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009696 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009697 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9698 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009699 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009700 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009701 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009702 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009703 int use_memcpy;
9704 unsigned char *res_data = NULL, *sep_data = NULL;
9705 PyObject *last_obj;
9706 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009707
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009708 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009709 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009710 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009711 }
9712
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009713 /* NOTE: the following code can't call back into Python code,
9714 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009715 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009716
Tim Peters05eba1f2004-08-27 21:32:02 +00009717 seqlen = PySequence_Fast_GET_SIZE(fseq);
9718 /* If empty sequence, return u"". */
9719 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009720 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009721 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009722 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009723
Tim Peters05eba1f2004-08-27 21:32:02 +00009724 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009725 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009726 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009727 if (seqlen == 1) {
9728 if (PyUnicode_CheckExact(items[0])) {
9729 res = items[0];
9730 Py_INCREF(res);
9731 Py_DECREF(fseq);
9732 return res;
9733 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009734 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009735 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009736 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009737 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009738 /* Set up sep and seplen */
9739 if (separator == NULL) {
9740 /* fall back to a blank space separator */
9741 sep = PyUnicode_FromOrdinal(' ');
9742 if (!sep)
9743 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009744 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009745 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009746 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009747 else {
9748 if (!PyUnicode_Check(separator)) {
9749 PyErr_Format(PyExc_TypeError,
9750 "separator: expected str instance,"
9751 " %.80s found",
9752 Py_TYPE(separator)->tp_name);
9753 goto onError;
9754 }
9755 if (PyUnicode_READY(separator))
9756 goto onError;
9757 sep = separator;
9758 seplen = PyUnicode_GET_LENGTH(separator);
9759 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9760 /* inc refcount to keep this code path symmetric with the
9761 above case of a blank separator */
9762 Py_INCREF(sep);
9763 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009764 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009765 }
9766
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009767 /* There are at least two things to join, or else we have a subclass
9768 * of str in the sequence.
9769 * Do a pre-pass to figure out the total amount of space we'll
9770 * need (sz), and see whether all argument are strings.
9771 */
9772 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009773#ifdef Py_DEBUG
9774 use_memcpy = 0;
9775#else
9776 use_memcpy = 1;
9777#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009778 for (i = 0; i < seqlen; i++) {
9779 const Py_ssize_t old_sz = sz;
9780 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009781 if (!PyUnicode_Check(item)) {
9782 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009783 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009784 " %.80s found",
9785 i, Py_TYPE(item)->tp_name);
9786 goto onError;
9787 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009788 if (PyUnicode_READY(item) == -1)
9789 goto onError;
9790 sz += PyUnicode_GET_LENGTH(item);
9791 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009792 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009793 if (i != 0)
9794 sz += seplen;
9795 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9796 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009797 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009798 goto onError;
9799 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009800 if (use_memcpy && last_obj != NULL) {
9801 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9802 use_memcpy = 0;
9803 }
9804 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009805 }
Tim Petersced69f82003-09-16 20:30:58 +00009806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009807 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009808 if (res == NULL)
9809 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009810
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009811 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009812#ifdef Py_DEBUG
9813 use_memcpy = 0;
9814#else
9815 if (use_memcpy) {
9816 res_data = PyUnicode_1BYTE_DATA(res);
9817 kind = PyUnicode_KIND(res);
9818 if (seplen != 0)
9819 sep_data = PyUnicode_1BYTE_DATA(sep);
9820 }
9821#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009822 if (use_memcpy) {
9823 for (i = 0; i < seqlen; ++i) {
9824 Py_ssize_t itemlen;
9825 item = items[i];
9826
9827 /* Copy item, and maybe the separator. */
9828 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009829 Py_MEMCPY(res_data,
9830 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009831 kind * seplen);
9832 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009833 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009834
9835 itemlen = PyUnicode_GET_LENGTH(item);
9836 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009837 Py_MEMCPY(res_data,
9838 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009839 kind * itemlen);
9840 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009841 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009842 }
9843 assert(res_data == PyUnicode_1BYTE_DATA(res)
9844 + kind * PyUnicode_GET_LENGTH(res));
9845 }
9846 else {
9847 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9848 Py_ssize_t itemlen;
9849 item = items[i];
9850
9851 /* Copy item, and maybe the separator. */
9852 if (i && seplen != 0) {
9853 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9854 res_offset += seplen;
9855 }
9856
9857 itemlen = PyUnicode_GET_LENGTH(item);
9858 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009859 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009860 res_offset += itemlen;
9861 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009862 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009863 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009864 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009865
Tim Peters05eba1f2004-08-27 21:32:02 +00009866 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009868 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009870
Benjamin Peterson29060642009-01-31 22:14:21 +00009871 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009872 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009873 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009874 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009875 return NULL;
9876}
9877
/* FILL(kind, data, value, start, length)
 *
 * Store `value` into `length` consecutive characters of the character
 * buffer `data`, starting at character index `start`.  `kind` selects the
 * element width used to address the buffer (1, 2 or 4 byte characters);
 * PyUnicode_WCHAR_KIND is rejected by an assertion.
 *
 * This is a statement macro: every argument is parenthesized at each use,
 * but `value` and `length` are expanded inside the fill loops and are
 * therefore evaluated more than once.  Pass side-effect-free expressions.
 */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9901
Victor Stinnerd3f08822012-05-29 12:57:52 +02009902void
9903_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9904 Py_UCS4 fill_char)
9905{
9906 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9907 const void *data = PyUnicode_DATA(unicode);
9908 assert(PyUnicode_IS_READY(unicode));
9909 assert(unicode_modifiable(unicode));
9910 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9911 assert(start >= 0);
9912 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9913 FILL(kind, data, fill_char, start, length);
9914}
9915
Victor Stinner3fe55312012-01-04 00:33:50 +01009916Py_ssize_t
9917PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9918 Py_UCS4 fill_char)
9919{
9920 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009921
9922 if (!PyUnicode_Check(unicode)) {
9923 PyErr_BadInternalCall();
9924 return -1;
9925 }
9926 if (PyUnicode_READY(unicode) == -1)
9927 return -1;
9928 if (unicode_check_modifiable(unicode))
9929 return -1;
9930
Victor Stinnerd3f08822012-05-29 12:57:52 +02009931 if (start < 0) {
9932 PyErr_SetString(PyExc_IndexError, "string index out of range");
9933 return -1;
9934 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009935 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9936 PyErr_SetString(PyExc_ValueError,
9937 "fill character is bigger than "
9938 "the string maximum character");
9939 return -1;
9940 }
9941
9942 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9943 length = Py_MIN(maxlen, length);
9944 if (length <= 0)
9945 return 0;
9946
Victor Stinnerd3f08822012-05-29 12:57:52 +02009947 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009948 return length;
9949}
9950
Victor Stinner9310abb2011-10-05 00:59:23 +02009951static PyObject *
9952pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009953 Py_ssize_t left,
9954 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009955 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009956{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009957 PyObject *u;
9958 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009959 int kind;
9960 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009961
9962 if (left < 0)
9963 left = 0;
9964 if (right < 0)
9965 right = 0;
9966
Victor Stinnerc4b49542011-12-11 22:44:26 +01009967 if (left == 0 && right == 0)
9968 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009970 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9971 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009972 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9973 return NULL;
9974 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009975 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009976 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009977 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009978 if (!u)
9979 return NULL;
9980
9981 kind = PyUnicode_KIND(u);
9982 data = PyUnicode_DATA(u);
9983 if (left)
9984 FILL(kind, data, fill, 0, left);
9985 if (right)
9986 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009987 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009988 assert(_PyUnicode_CheckConsistency(u, 1));
9989 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990}
9991
Alexander Belopolsky40018472011-02-26 01:02:56 +00009992PyObject *
9993PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009994{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009995 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009996
9997 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009998 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009999 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060010000 if (PyUnicode_READY(string) == -1) {
10001 Py_DECREF(string);
10002 return NULL;
10003 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010004
Benjamin Petersonead6b532011-12-20 17:23:42 -060010005 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010006 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010007 if (PyUnicode_IS_ASCII(string))
10008 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010009 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010010 PyUnicode_GET_LENGTH(string), keepends);
10011 else
10012 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010013 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010014 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 break;
10016 case PyUnicode_2BYTE_KIND:
10017 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010018 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 PyUnicode_GET_LENGTH(string), keepends);
10020 break;
10021 case PyUnicode_4BYTE_KIND:
10022 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010023 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 PyUnicode_GET_LENGTH(string), keepends);
10025 break;
10026 default:
10027 assert(0);
10028 list = 0;
10029 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010030 Py_DECREF(string);
10031 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032}
10033
Alexander Belopolsky40018472011-02-26 01:02:56 +000010034static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010035split(PyObject *self,
10036 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010037 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010038{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010039 int kind1, kind2, kind;
10040 void *buf1, *buf2;
10041 Py_ssize_t len1, len2;
10042 PyObject* out;
10043
Guido van Rossumd57fd912000-03-10 22:53:23 +000010044 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010045 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010047 if (PyUnicode_READY(self) == -1)
10048 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010051 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010053 if (PyUnicode_IS_ASCII(self))
10054 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010055 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010056 PyUnicode_GET_LENGTH(self), maxcount
10057 );
10058 else
10059 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010060 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010061 PyUnicode_GET_LENGTH(self), maxcount
10062 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010063 case PyUnicode_2BYTE_KIND:
10064 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010065 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010066 PyUnicode_GET_LENGTH(self), maxcount
10067 );
10068 case PyUnicode_4BYTE_KIND:
10069 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010070 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010071 PyUnicode_GET_LENGTH(self), maxcount
10072 );
10073 default:
10074 assert(0);
10075 return NULL;
10076 }
10077
10078 if (PyUnicode_READY(substring) == -1)
10079 return NULL;
10080
10081 kind1 = PyUnicode_KIND(self);
10082 kind2 = PyUnicode_KIND(substring);
10083 kind = kind1 > kind2 ? kind1 : kind2;
10084 buf1 = PyUnicode_DATA(self);
10085 buf2 = PyUnicode_DATA(substring);
10086 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010087 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 if (!buf1)
10089 return NULL;
10090 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010091 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010092 if (!buf2) {
10093 if (kind1 != kind) PyMem_Free(buf1);
10094 return NULL;
10095 }
10096 len1 = PyUnicode_GET_LENGTH(self);
10097 len2 = PyUnicode_GET_LENGTH(substring);
10098
Benjamin Petersonead6b532011-12-20 17:23:42 -060010099 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010100 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010101 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10102 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010103 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010104 else
10105 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010106 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 break;
10108 case PyUnicode_2BYTE_KIND:
10109 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010110 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 break;
10112 case PyUnicode_4BYTE_KIND:
10113 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010114 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 break;
10116 default:
10117 out = NULL;
10118 }
10119 if (kind1 != kind)
10120 PyMem_Free(buf1);
10121 if (kind2 != kind)
10122 PyMem_Free(buf2);
10123 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010124}
10125
Alexander Belopolsky40018472011-02-26 01:02:56 +000010126static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010127rsplit(PyObject *self,
10128 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010129 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010130{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 int kind1, kind2, kind;
10132 void *buf1, *buf2;
10133 Py_ssize_t len1, len2;
10134 PyObject* out;
10135
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010136 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010137 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010139 if (PyUnicode_READY(self) == -1)
10140 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010143 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010145 if (PyUnicode_IS_ASCII(self))
10146 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010147 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010148 PyUnicode_GET_LENGTH(self), maxcount
10149 );
10150 else
10151 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010152 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010153 PyUnicode_GET_LENGTH(self), maxcount
10154 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 case PyUnicode_2BYTE_KIND:
10156 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010157 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 PyUnicode_GET_LENGTH(self), maxcount
10159 );
10160 case PyUnicode_4BYTE_KIND:
10161 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010162 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 PyUnicode_GET_LENGTH(self), maxcount
10164 );
10165 default:
10166 assert(0);
10167 return NULL;
10168 }
10169
10170 if (PyUnicode_READY(substring) == -1)
10171 return NULL;
10172
10173 kind1 = PyUnicode_KIND(self);
10174 kind2 = PyUnicode_KIND(substring);
10175 kind = kind1 > kind2 ? kind1 : kind2;
10176 buf1 = PyUnicode_DATA(self);
10177 buf2 = PyUnicode_DATA(substring);
10178 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010179 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 if (!buf1)
10181 return NULL;
10182 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010183 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 if (!buf2) {
10185 if (kind1 != kind) PyMem_Free(buf1);
10186 return NULL;
10187 }
10188 len1 = PyUnicode_GET_LENGTH(self);
10189 len2 = PyUnicode_GET_LENGTH(substring);
10190
Benjamin Petersonead6b532011-12-20 17:23:42 -060010191 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010192 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010193 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10194 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010195 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010196 else
10197 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010198 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010199 break;
10200 case PyUnicode_2BYTE_KIND:
10201 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010202 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 break;
10204 case PyUnicode_4BYTE_KIND:
10205 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010206 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 break;
10208 default:
10209 out = NULL;
10210 }
10211 if (kind1 != kind)
10212 PyMem_Free(buf1);
10213 if (kind2 != kind)
10214 PyMem_Free(buf2);
10215 return out;
10216}
10217
10218static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010219anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10220 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010222 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010224 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10225 return asciilib_find(buf1, len1, buf2, len2, offset);
10226 else
10227 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010228 case PyUnicode_2BYTE_KIND:
10229 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10230 case PyUnicode_4BYTE_KIND:
10231 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10232 }
10233 assert(0);
10234 return -1;
10235}
10236
10237static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010238anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10239 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010241 switch (kind) {
10242 case PyUnicode_1BYTE_KIND:
10243 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10244 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10245 else
10246 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10247 case PyUnicode_2BYTE_KIND:
10248 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10249 case PyUnicode_4BYTE_KIND:
10250 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10251 }
10252 assert(0);
10253 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010254}
10255
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010256static void
10257replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10258 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10259{
10260 int kind = PyUnicode_KIND(u);
10261 void *data = PyUnicode_DATA(u);
10262 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10263 if (kind == PyUnicode_1BYTE_KIND) {
10264 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10265 (Py_UCS1 *)data + len,
10266 u1, u2, maxcount);
10267 }
10268 else if (kind == PyUnicode_2BYTE_KIND) {
10269 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10270 (Py_UCS2 *)data + len,
10271 u1, u2, maxcount);
10272 }
10273 else {
10274 assert(kind == PyUnicode_4BYTE_KIND);
10275 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10276 (Py_UCS4 *)data + len,
10277 u1, u2, maxcount);
10278 }
10279}
10280
Alexander Belopolsky40018472011-02-26 01:02:56 +000010281static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282replace(PyObject *self, PyObject *str1,
10283 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 PyObject *u;
10286 char *sbuf = PyUnicode_DATA(self);
10287 char *buf1 = PyUnicode_DATA(str1);
10288 char *buf2 = PyUnicode_DATA(str2);
10289 int srelease = 0, release1 = 0, release2 = 0;
10290 int skind = PyUnicode_KIND(self);
10291 int kind1 = PyUnicode_KIND(str1);
10292 int kind2 = PyUnicode_KIND(str2);
10293 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10294 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10295 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010296 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010297 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010298
10299 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010300 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010302 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010303
Victor Stinner59de0ee2011-10-07 10:01:28 +020010304 if (str1 == str2)
10305 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306
Victor Stinner49a0a212011-10-12 23:46:10 +020010307 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010308 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10309 if (maxchar < maxchar_str1)
10310 /* substring too wide to be present */
10311 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010312 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10313 /* Replacing str1 with str2 may cause a maxchar reduction in the
10314 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010315 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010316 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010319 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010321 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010323 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010324 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010325 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010326
Victor Stinner69ed0f42013-04-09 21:48:24 +020010327 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010328 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010329 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010330 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010331 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010333 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010335
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010336 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10337 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010338 }
10339 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 int rkind = skind;
10341 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010342 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 if (kind1 < rkind) {
10345 /* widen substring */
10346 buf1 = _PyUnicode_AsKind(str1, rkind);
10347 if (!buf1) goto error;
10348 release1 = 1;
10349 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010350 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010351 if (i < 0)
10352 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 if (rkind > kind2) {
10354 /* widen replacement */
10355 buf2 = _PyUnicode_AsKind(str2, rkind);
10356 if (!buf2) goto error;
10357 release2 = 1;
10358 }
10359 else if (rkind < kind2) {
10360 /* widen self and buf1 */
10361 rkind = kind2;
10362 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010363 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 sbuf = _PyUnicode_AsKind(self, rkind);
10365 if (!sbuf) goto error;
10366 srelease = 1;
10367 buf1 = _PyUnicode_AsKind(str1, rkind);
10368 if (!buf1) goto error;
10369 release1 = 1;
10370 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010371 u = PyUnicode_New(slen, maxchar);
10372 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010374 assert(PyUnicode_KIND(u) == rkind);
10375 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010376
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010377 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010378 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010379 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010381 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010383
10384 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010385 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010386 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010387 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010388 if (i == -1)
10389 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010390 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010391 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010392 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010394 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010395 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010396 }
10397 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010399 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 int rkind = skind;
10401 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010404 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 buf1 = _PyUnicode_AsKind(str1, rkind);
10406 if (!buf1) goto error;
10407 release1 = 1;
10408 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010409 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010410 if (n == 0)
10411 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010413 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414 buf2 = _PyUnicode_AsKind(str2, rkind);
10415 if (!buf2) goto error;
10416 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010417 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010419 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420 rkind = kind2;
10421 sbuf = _PyUnicode_AsKind(self, rkind);
10422 if (!sbuf) goto error;
10423 srelease = 1;
10424 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010425 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 buf1 = _PyUnicode_AsKind(str1, rkind);
10427 if (!buf1) goto error;
10428 release1 = 1;
10429 }
10430 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10431 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010432 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 PyErr_SetString(PyExc_OverflowError,
10434 "replace string is too long");
10435 goto error;
10436 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010437 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010438 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010439 _Py_INCREF_UNICODE_EMPTY();
10440 if (!unicode_empty)
10441 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010442 u = unicode_empty;
10443 goto done;
10444 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010445 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 PyErr_SetString(PyExc_OverflowError,
10447 "replace string is too long");
10448 goto error;
10449 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010450 u = PyUnicode_New(new_size, maxchar);
10451 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010453 assert(PyUnicode_KIND(u) == rkind);
10454 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 ires = i = 0;
10456 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010457 while (n-- > 0) {
10458 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010459 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010460 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010461 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010462 if (j == -1)
10463 break;
10464 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010465 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010466 memcpy(res + rkind * ires,
10467 sbuf + rkind * i,
10468 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010470 }
10471 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010473 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010475 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010479 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010481 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010482 memcpy(res + rkind * ires,
10483 sbuf + rkind * i,
10484 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010485 }
10486 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010487 /* interleave */
10488 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010489 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010491 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010493 if (--n <= 0)
10494 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010495 memcpy(res + rkind * ires,
10496 sbuf + rkind * i,
10497 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 ires++;
10499 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010500 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010501 memcpy(res + rkind * ires,
10502 sbuf + rkind * i,
10503 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010504 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010505 }
10506
10507 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010508 unicode_adjust_maxchar(&u);
10509 if (u == NULL)
10510 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010511 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010512
10513 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 if (srelease)
10515 PyMem_FREE(sbuf);
10516 if (release1)
10517 PyMem_FREE(buf1);
10518 if (release2)
10519 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010520 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010522
Benjamin Peterson29060642009-01-31 22:14:21 +000010523 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010524 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 if (srelease)
10526 PyMem_FREE(sbuf);
10527 if (release1)
10528 PyMem_FREE(buf1);
10529 if (release2)
10530 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010531 return unicode_result_unchanged(self);
10532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 error:
10534 if (srelease && sbuf)
10535 PyMem_FREE(sbuf);
10536 if (release1 && buf1)
10537 PyMem_FREE(buf1);
10538 if (release2 && buf2)
10539 PyMem_FREE(buf2);
10540 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010541}
10542
10543/* --- Unicode Object Methods --------------------------------------------- */
10544
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010545PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010546 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547\n\
10548Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010549characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010550
10551static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010552unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010553{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010554 if (PyUnicode_READY(self) == -1)
10555 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010556 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010557}
10558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010559PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010560 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010561\n\
10562Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010563have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010564
10565static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010566unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010567{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010568 if (PyUnicode_READY(self) == -1)
10569 return NULL;
10570 if (PyUnicode_GET_LENGTH(self) == 0)
10571 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010572 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010573}
10574
Benjamin Petersond5890c82012-01-14 13:23:30 -050010575PyDoc_STRVAR(casefold__doc__,
10576 "S.casefold() -> str\n\
10577\n\
10578Return a version of S suitable for caseless comparisons.");
10579
10580static PyObject *
10581unicode_casefold(PyObject *self)
10582{
10583 if (PyUnicode_READY(self) == -1)
10584 return NULL;
10585 if (PyUnicode_IS_ASCII(self))
10586 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010587 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010588}
10589
10590
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010591/* Argument converter. Coerces to a single unicode character */
10592
10593static int
10594convert_uc(PyObject *obj, void *addr)
10595{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010597 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010598
Benjamin Peterson14339b62009-01-31 16:36:08 +000010599 uniobj = PyUnicode_FromObject(obj);
10600 if (uniobj == NULL) {
10601 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010602 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010603 return 0;
10604 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010606 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010607 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010608 Py_DECREF(uniobj);
10609 return 0;
10610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010612 Py_DECREF(uniobj);
10613 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010614}
10615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010616PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010617 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010619Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010620done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621
10622static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010623unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010624{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010625 Py_ssize_t marg, left;
10626 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 Py_UCS4 fillchar = ' ';
10628
Victor Stinnere9a29352011-10-01 02:14:59 +020010629 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010631
Benjamin Petersonbac79492012-01-14 13:34:47 -050010632 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010633 return NULL;
10634
Victor Stinnerc4b49542011-12-11 22:44:26 +010010635 if (PyUnicode_GET_LENGTH(self) >= width)
10636 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010637
Victor Stinnerc4b49542011-12-11 22:44:26 +010010638 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010639 left = marg / 2 + (marg & width & 1);
10640
Victor Stinner9310abb2011-10-05 00:59:23 +020010641 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642}
10643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644/* This function assumes that str1 and str2 are readied by the caller. */
10645
Marc-André Lemburge5034372000-08-08 08:04:29 +000010646static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010647unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010648{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010649#define COMPARE(TYPE1, TYPE2) \
10650 do { \
10651 TYPE1* p1 = (TYPE1 *)data1; \
10652 TYPE2* p2 = (TYPE2 *)data2; \
10653 TYPE1* end = p1 + len; \
10654 Py_UCS4 c1, c2; \
10655 for (; p1 != end; p1++, p2++) { \
10656 c1 = *p1; \
10657 c2 = *p2; \
10658 if (c1 != c2) \
10659 return (c1 < c2) ? -1 : 1; \
10660 } \
10661 } \
10662 while (0)
10663
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 int kind1, kind2;
10665 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010666 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010667
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 kind1 = PyUnicode_KIND(str1);
10669 kind2 = PyUnicode_KIND(str2);
10670 data1 = PyUnicode_DATA(str1);
10671 data2 = PyUnicode_DATA(str2);
10672 len1 = PyUnicode_GET_LENGTH(str1);
10673 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010674 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010675
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010676 switch(kind1) {
10677 case PyUnicode_1BYTE_KIND:
10678 {
10679 switch(kind2) {
10680 case PyUnicode_1BYTE_KIND:
10681 {
10682 int cmp = memcmp(data1, data2, len);
10683 /* normalize result of memcmp() into the range [-1; 1] */
10684 if (cmp < 0)
10685 return -1;
10686 if (cmp > 0)
10687 return 1;
10688 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010689 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010690 case PyUnicode_2BYTE_KIND:
10691 COMPARE(Py_UCS1, Py_UCS2);
10692 break;
10693 case PyUnicode_4BYTE_KIND:
10694 COMPARE(Py_UCS1, Py_UCS4);
10695 break;
10696 default:
10697 assert(0);
10698 }
10699 break;
10700 }
10701 case PyUnicode_2BYTE_KIND:
10702 {
10703 switch(kind2) {
10704 case PyUnicode_1BYTE_KIND:
10705 COMPARE(Py_UCS2, Py_UCS1);
10706 break;
10707 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010708 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010709 COMPARE(Py_UCS2, Py_UCS2);
10710 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010711 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010712 case PyUnicode_4BYTE_KIND:
10713 COMPARE(Py_UCS2, Py_UCS4);
10714 break;
10715 default:
10716 assert(0);
10717 }
10718 break;
10719 }
10720 case PyUnicode_4BYTE_KIND:
10721 {
10722 switch(kind2) {
10723 case PyUnicode_1BYTE_KIND:
10724 COMPARE(Py_UCS4, Py_UCS1);
10725 break;
10726 case PyUnicode_2BYTE_KIND:
10727 COMPARE(Py_UCS4, Py_UCS2);
10728 break;
10729 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010730 {
10731#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10732 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10733 /* normalize result of wmemcmp() into the range [-1; 1] */
10734 if (cmp < 0)
10735 return -1;
10736 if (cmp > 0)
10737 return 1;
10738#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010739 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010740#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010741 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010742 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010743 default:
10744 assert(0);
10745 }
10746 break;
10747 }
10748 default:
10749 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010750 }
10751
Victor Stinner770e19e2012-10-04 22:59:45 +020010752 if (len1 == len2)
10753 return 0;
10754 if (len1 < len2)
10755 return -1;
10756 else
10757 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010758
10759#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010760}
10761
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010762Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010763unicode_compare_eq(PyObject *str1, PyObject *str2)
10764{
10765 int kind;
10766 void *data1, *data2;
10767 Py_ssize_t len;
10768 int cmp;
10769
Victor Stinnere5567ad2012-10-23 02:48:49 +020010770 len = PyUnicode_GET_LENGTH(str1);
10771 if (PyUnicode_GET_LENGTH(str2) != len)
10772 return 0;
10773 kind = PyUnicode_KIND(str1);
10774 if (PyUnicode_KIND(str2) != kind)
10775 return 0;
10776 data1 = PyUnicode_DATA(str1);
10777 data2 = PyUnicode_DATA(str2);
10778
10779 cmp = memcmp(data1, data2, len * kind);
10780 return (cmp == 0);
10781}
10782
10783
Alexander Belopolsky40018472011-02-26 01:02:56 +000010784int
10785PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010787 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10788 if (PyUnicode_READY(left) == -1 ||
10789 PyUnicode_READY(right) == -1)
10790 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010791
10792 /* a string is equal to itself */
10793 if (left == right)
10794 return 0;
10795
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010796 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010798 PyErr_Format(PyExc_TypeError,
10799 "Can't compare %.100s and %.100s",
10800 left->ob_type->tp_name,
10801 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010802 return -1;
10803}
10804
Martin v. Löwis5b222132007-06-10 09:51:05 +000010805int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010806_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10807{
10808 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10809 if (right_str == NULL)
10810 return -1;
10811 return PyUnicode_Compare(left, right_str);
10812}
10813
10814int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010815PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10816{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010817 Py_ssize_t i;
10818 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 Py_UCS4 chr;
10820
Victor Stinner910337b2011-10-03 03:20:16 +020010821 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010822 if (PyUnicode_READY(uni) == -1)
10823 return -1;
10824 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010825 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010826 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010827 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010828 size_t len, len2 = strlen(str);
10829 int cmp;
10830
10831 len = Py_MIN(len1, len2);
10832 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010833 if (cmp != 0) {
10834 if (cmp < 0)
10835 return -1;
10836 else
10837 return 1;
10838 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010839 if (len1 > len2)
10840 return 1; /* uni is longer */
10841 if (len2 > len1)
10842 return -1; /* str is longer */
10843 return 0;
10844 }
10845 else {
10846 void *data = PyUnicode_DATA(uni);
10847 /* Compare Unicode string and source character set string */
10848 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10849 if (chr != str[i])
10850 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10851 /* This check keeps Python strings that end in '\0' from comparing equal
10852 to C strings identical up to that point. */
10853 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10854 return 1; /* uni is longer */
10855 if (str[i])
10856 return -1; /* str is longer */
10857 return 0;
10858 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010859}
10860
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010861
Benjamin Peterson29060642009-01-31 22:14:21 +000010862#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010863 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010864
Alexander Belopolsky40018472011-02-26 01:02:56 +000010865PyObject *
10866PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010867{
10868 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010869 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010870
Victor Stinnere5567ad2012-10-23 02:48:49 +020010871 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10872 Py_RETURN_NOTIMPLEMENTED;
10873
10874 if (PyUnicode_READY(left) == -1 ||
10875 PyUnicode_READY(right) == -1)
10876 return NULL;
10877
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010878 if (left == right) {
10879 switch (op) {
10880 case Py_EQ:
10881 case Py_LE:
10882 case Py_GE:
10883 /* a string is equal to itself */
10884 v = Py_True;
10885 break;
10886 case Py_NE:
10887 case Py_LT:
10888 case Py_GT:
10889 v = Py_False;
10890 break;
10891 default:
10892 PyErr_BadArgument();
10893 return NULL;
10894 }
10895 }
10896 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010897 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010898 result ^= (op == Py_NE);
10899 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010900 }
10901 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010902 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010903
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010904 /* Convert the return value to a Boolean */
10905 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010906 case Py_LE:
10907 v = TEST_COND(result <= 0);
10908 break;
10909 case Py_GE:
10910 v = TEST_COND(result >= 0);
10911 break;
10912 case Py_LT:
10913 v = TEST_COND(result == -1);
10914 break;
10915 case Py_GT:
10916 v = TEST_COND(result == 1);
10917 break;
10918 default:
10919 PyErr_BadArgument();
10920 return NULL;
10921 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010922 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010923 Py_INCREF(v);
10924 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010925}
10926
Alexander Belopolsky40018472011-02-26 01:02:56 +000010927int
10928PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010929{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010930 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010931 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010932 void *buf1, *buf2;
10933 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010934 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010935
10936 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010937 sub = PyUnicode_FromObject(element);
10938 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010939 PyErr_Format(PyExc_TypeError,
10940 "'in <string>' requires string as left operand, not %s",
10941 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010942 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010943 }
10944
Thomas Wouters477c8d52006-05-27 19:21:47 +000010945 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010946 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010947 Py_DECREF(sub);
10948 return -1;
10949 }
10950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010951 kind1 = PyUnicode_KIND(str);
10952 kind2 = PyUnicode_KIND(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010953 buf1 = PyUnicode_DATA(str);
10954 buf2 = PyUnicode_DATA(sub);
Victor Stinner77282cb2013-04-14 19:22:47 +020010955 if (kind2 != kind1) {
10956 if (kind2 > kind1) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010957 Py_DECREF(sub);
10958 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010959 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010960 }
Victor Stinner77282cb2013-04-14 19:22:47 +020010961 buf2 = _PyUnicode_AsKind(sub, kind1);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010962 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010963 if (!buf2) {
10964 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010965 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966 return -1;
10967 }
10968 len1 = PyUnicode_GET_LENGTH(str);
10969 len2 = PyUnicode_GET_LENGTH(sub);
10970
Victor Stinner77282cb2013-04-14 19:22:47 +020010971 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010972 case PyUnicode_1BYTE_KIND:
10973 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10974 break;
10975 case PyUnicode_2BYTE_KIND:
10976 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10977 break;
10978 case PyUnicode_4BYTE_KIND:
10979 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10980 break;
10981 default:
10982 result = -1;
10983 assert(0);
10984 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010985
10986 Py_DECREF(str);
10987 Py_DECREF(sub);
10988
Victor Stinner77282cb2013-04-14 19:22:47 +020010989 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010990 PyMem_Free(buf2);
10991
Guido van Rossum403d68b2000-03-13 15:55:09 +000010992 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010993}
10994
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995/* Concat to string or Unicode object giving a new Unicode object. */
10996
Alexander Belopolsky40018472011-02-26 01:02:56 +000010997PyObject *
10998PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010999{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011000 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020011001 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010011002 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003
11004 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011007 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011009 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011010 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011011
11012 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011013 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011014 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011015 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011016 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011017 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011018 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011019 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020 }
11021
Victor Stinner488fa492011-12-12 00:01:39 +010011022 u_len = PyUnicode_GET_LENGTH(u);
11023 v_len = PyUnicode_GET_LENGTH(v);
11024 if (u_len > PY_SSIZE_T_MAX - v_len) {
11025 PyErr_SetString(PyExc_OverflowError,
11026 "strings are too large to concat");
11027 goto onError;
11028 }
11029 new_len = u_len + v_len;
11030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011031 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011032 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011033 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011034
Guido van Rossumd57fd912000-03-10 22:53:23 +000011035 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011036 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011038 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011039 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11040 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041 Py_DECREF(u);
11042 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011043 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011044 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045
Benjamin Peterson29060642009-01-31 22:14:21 +000011046 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011047 Py_XDECREF(u);
11048 Py_XDECREF(v);
11049 return NULL;
11050}
11051
Walter Dörwald1ab83302007-05-18 17:15:44 +000011052void
Victor Stinner23e56682011-10-03 03:54:37 +020011053PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011054{
Victor Stinner23e56682011-10-03 03:54:37 +020011055 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011056 Py_UCS4 maxchar, maxchar2;
11057 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011058
11059 if (p_left == NULL) {
11060 if (!PyErr_Occurred())
11061 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011062 return;
11063 }
Victor Stinner23e56682011-10-03 03:54:37 +020011064 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011065 if (right == NULL || left == NULL
11066 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011067 if (!PyErr_Occurred())
11068 PyErr_BadInternalCall();
11069 goto error;
11070 }
11071
Benjamin Petersonbac79492012-01-14 13:34:47 -050011072 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011073 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011074 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011075 goto error;
11076
Victor Stinner488fa492011-12-12 00:01:39 +010011077 /* Shortcuts */
11078 if (left == unicode_empty) {
11079 Py_DECREF(left);
11080 Py_INCREF(right);
11081 *p_left = right;
11082 return;
11083 }
11084 if (right == unicode_empty)
11085 return;
11086
11087 left_len = PyUnicode_GET_LENGTH(left);
11088 right_len = PyUnicode_GET_LENGTH(right);
11089 if (left_len > PY_SSIZE_T_MAX - right_len) {
11090 PyErr_SetString(PyExc_OverflowError,
11091 "strings are too large to concat");
11092 goto error;
11093 }
11094 new_len = left_len + right_len;
11095
11096 if (unicode_modifiable(left)
11097 && PyUnicode_CheckExact(right)
11098 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011099 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11100 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011101 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011102 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011103 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11104 {
11105 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011106 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011107 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011108
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011109 /* copy 'right' into the newly allocated area of 'left' */
11110 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011111 }
Victor Stinner488fa492011-12-12 00:01:39 +010011112 else {
11113 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11114 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011115 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011116
Victor Stinner488fa492011-12-12 00:01:39 +010011117 /* Concat the two Unicode strings */
11118 res = PyUnicode_New(new_len, maxchar);
11119 if (res == NULL)
11120 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011121 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11122 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011123 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011124 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011125 }
11126 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011127 return;
11128
11129error:
Victor Stinner488fa492011-12-12 00:01:39 +010011130 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011131}
11132
11133void
11134PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11135{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011136 PyUnicode_Append(pleft, right);
11137 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011138}
11139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011140PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011141 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011143Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011144string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011145interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011146
11147static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011148unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011150 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011151 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011152 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011154 int kind1, kind2, kind;
11155 void *buf1, *buf2;
11156 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011157
Jesus Ceaac451502011-04-20 17:09:23 +020011158 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11159 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011160 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011161
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162 kind1 = PyUnicode_KIND(self);
11163 kind2 = PyUnicode_KIND(substring);
Christian Heimesd47802e2013-06-29 21:33:36 +020011164 if (kind2 > kind1) {
11165 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011166 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011167 }
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011168 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011169 buf1 = PyUnicode_DATA(self);
11170 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011171 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011172 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173 if (!buf2) {
11174 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011175 return NULL;
11176 }
11177 len1 = PyUnicode_GET_LENGTH(self);
11178 len2 = PyUnicode_GET_LENGTH(substring);
11179
11180 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011181 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011182 case PyUnicode_1BYTE_KIND:
11183 iresult = ucs1lib_count(
11184 ((Py_UCS1*)buf1) + start, end - start,
11185 buf2, len2, PY_SSIZE_T_MAX
11186 );
11187 break;
11188 case PyUnicode_2BYTE_KIND:
11189 iresult = ucs2lib_count(
11190 ((Py_UCS2*)buf1) + start, end - start,
11191 buf2, len2, PY_SSIZE_T_MAX
11192 );
11193 break;
11194 case PyUnicode_4BYTE_KIND:
11195 iresult = ucs4lib_count(
11196 ((Py_UCS4*)buf1) + start, end - start,
11197 buf2, len2, PY_SSIZE_T_MAX
11198 );
11199 break;
11200 default:
11201 assert(0); iresult = 0;
11202 }
11203
11204 result = PyLong_FromSsize_t(iresult);
11205
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011206 if (kind2 != kind)
11207 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011208
11209 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011210
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211 return result;
11212}
11213
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011214PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011215 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011216\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011217Encode S using the codec registered for encoding. Default encoding\n\
11218is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011219handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011220a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11221'xmlcharrefreplace' as well as any other name registered with\n\
11222codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223
11224static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011225unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011226{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011227 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228 char *encoding = NULL;
11229 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011230
Benjamin Peterson308d6372009-09-18 21:42:35 +000011231 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11232 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011233 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011234 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011235}
11236
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011237PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011238 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239\n\
11240Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011241If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242
11243static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011244unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011246 Py_ssize_t i, j, line_pos, src_len, incr;
11247 Py_UCS4 ch;
11248 PyObject *u;
11249 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011250 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011252 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011253 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254
Ezio Melotti745d54d2013-11-16 19:10:57 +020011255 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11256 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011257 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258
Antoine Pitrou22425222011-10-04 19:10:51 +020011259 if (PyUnicode_READY(self) == -1)
11260 return NULL;
11261
Thomas Wouters7e474022000-07-16 12:04:32 +000011262 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011263 src_len = PyUnicode_GET_LENGTH(self);
11264 i = j = line_pos = 0;
11265 kind = PyUnicode_KIND(self);
11266 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011267 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011268 for (; i < src_len; i++) {
11269 ch = PyUnicode_READ(kind, src_data, i);
11270 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011271 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011272 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011273 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011274 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011275 goto overflow;
11276 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011277 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011278 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011279 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011281 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011282 goto overflow;
11283 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011285 if (ch == '\n' || ch == '\r')
11286 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011288 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011289 if (!found)
11290 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011291
Guido van Rossumd57fd912000-03-10 22:53:23 +000011292 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011293 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294 if (!u)
11295 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011296 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297
Antoine Pitroue71d5742011-10-04 15:55:09 +020011298 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011299
Antoine Pitroue71d5742011-10-04 15:55:09 +020011300 for (; i < src_len; i++) {
11301 ch = PyUnicode_READ(kind, src_data, i);
11302 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011303 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011304 incr = tabsize - (line_pos % tabsize);
11305 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011306 FILL(kind, dest_data, ' ', j, incr);
11307 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011308 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011309 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011310 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011311 line_pos++;
11312 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011313 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011314 if (ch == '\n' || ch == '\r')
11315 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011317 }
11318 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011319 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011320
Antoine Pitroue71d5742011-10-04 15:55:09 +020011321 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011322 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11323 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324}
11325
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011326PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011327 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328\n\
11329Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011330such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331arguments start and end are interpreted as in slice notation.\n\
11332\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011333Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334
11335static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011336unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011337{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011338 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011339 Py_ssize_t start;
11340 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011341 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342
Jesus Ceaac451502011-04-20 17:09:23 +020011343 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11344 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346
Christian Heimesd47802e2013-06-29 21:33:36 +020011347 if (PyUnicode_READY(self) == -1) {
11348 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011349 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011350 }
11351 if (PyUnicode_READY(substring) == -1) {
11352 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011354 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011355
Victor Stinner7931d9a2011-11-04 00:22:48 +010011356 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357
11358 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 if (result == -2)
11361 return NULL;
11362
Christian Heimes217cfd12007-12-02 14:31:20 +000011363 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364}
11365
11366static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011367unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011369 void *data;
11370 enum PyUnicode_Kind kind;
11371 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011372
11373 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11374 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011375 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011376 }
11377 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11378 PyErr_SetString(PyExc_IndexError, "string index out of range");
11379 return NULL;
11380 }
11381 kind = PyUnicode_KIND(self);
11382 data = PyUnicode_DATA(self);
11383 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011384 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385}
11386
Guido van Rossumc2504932007-09-18 19:42:40 +000011387/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011388 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011389static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011390unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011391{
Guido van Rossumc2504932007-09-18 19:42:40 +000011392 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011393 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011394
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011395#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011396 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011397#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011398 if (_PyUnicode_HASH(self) != -1)
11399 return _PyUnicode_HASH(self);
11400 if (PyUnicode_READY(self) == -1)
11401 return -1;
11402 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011403 /*
11404 We make the hash of the empty string be 0, rather than using
11405 (prefix ^ suffix), since this slightly obfuscates the hash secret
11406 */
11407 if (len == 0) {
11408 _PyUnicode_HASH(self) = 0;
11409 return 0;
11410 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011411 x = _Py_HashBytes(PyUnicode_DATA(self),
11412 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011414 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415}
11416
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011417PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011418 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011420Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421
11422static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011423unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011425 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011426 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011427 Py_ssize_t start;
11428 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429
Jesus Ceaac451502011-04-20 17:09:23 +020011430 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11431 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433
Christian Heimesd47a0452013-06-29 21:21:37 +020011434 if (PyUnicode_READY(self) == -1) {
11435 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011436 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011437 }
11438 if (PyUnicode_READY(substring) == -1) {
11439 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011440 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011441 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011442
Victor Stinner7931d9a2011-11-04 00:22:48 +010011443 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444
11445 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 if (result == -2)
11448 return NULL;
11449
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450 if (result < 0) {
11451 PyErr_SetString(PyExc_ValueError, "substring not found");
11452 return NULL;
11453 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011454
Christian Heimes217cfd12007-12-02 14:31:20 +000011455 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456}
11457
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011458PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011459 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011461Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011462at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463
11464static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011465unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011467 Py_ssize_t i, length;
11468 int kind;
11469 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470 int cased;
11471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011472 if (PyUnicode_READY(self) == -1)
11473 return NULL;
11474 length = PyUnicode_GET_LENGTH(self);
11475 kind = PyUnicode_KIND(self);
11476 data = PyUnicode_DATA(self);
11477
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011479 if (length == 1)
11480 return PyBool_FromLong(
11481 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011483 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011484 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011485 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011486
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011488 for (i = 0; i < length; i++) {
11489 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011490
Benjamin Peterson29060642009-01-31 22:14:21 +000011491 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11492 return PyBool_FromLong(0);
11493 else if (!cased && Py_UNICODE_ISLOWER(ch))
11494 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011496 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497}
11498
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011499PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011500 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011502Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011503at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504
11505static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011506unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011508 Py_ssize_t i, length;
11509 int kind;
11510 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511 int cased;
11512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 if (PyUnicode_READY(self) == -1)
11514 return NULL;
11515 length = PyUnicode_GET_LENGTH(self);
11516 kind = PyUnicode_KIND(self);
11517 data = PyUnicode_DATA(self);
11518
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011520 if (length == 1)
11521 return PyBool_FromLong(
11522 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011524 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011525 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011526 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011527
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011529 for (i = 0; i < length; i++) {
11530 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011531
Benjamin Peterson29060642009-01-31 22:14:21 +000011532 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11533 return PyBool_FromLong(0);
11534 else if (!cased && Py_UNICODE_ISUPPER(ch))
11535 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011537 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538}
11539
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011540PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011541 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011543Return True if S is a titlecased string and there is at least one\n\
11544character in S, i.e. upper- and titlecase characters may only\n\
11545follow uncased characters and lowercase characters only cased ones.\n\
11546Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547
11548static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011549unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551 Py_ssize_t i, length;
11552 int kind;
11553 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554 int cased, previous_is_cased;
11555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011556 if (PyUnicode_READY(self) == -1)
11557 return NULL;
11558 length = PyUnicode_GET_LENGTH(self);
11559 kind = PyUnicode_KIND(self);
11560 data = PyUnicode_DATA(self);
11561
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011563 if (length == 1) {
11564 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11565 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11566 (Py_UNICODE_ISUPPER(ch) != 0));
11567 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011568
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011569 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011570 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011571 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011572
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573 cased = 0;
11574 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011575 for (i = 0; i < length; i++) {
11576 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011577
Benjamin Peterson29060642009-01-31 22:14:21 +000011578 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11579 if (previous_is_cased)
11580 return PyBool_FromLong(0);
11581 previous_is_cased = 1;
11582 cased = 1;
11583 }
11584 else if (Py_UNICODE_ISLOWER(ch)) {
11585 if (!previous_is_cased)
11586 return PyBool_FromLong(0);
11587 previous_is_cased = 1;
11588 cased = 1;
11589 }
11590 else
11591 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011593 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594}
11595
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011596PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011597 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011599Return True if all characters in S are whitespace\n\
11600and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601
11602static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011603unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605 Py_ssize_t i, length;
11606 int kind;
11607 void *data;
11608
11609 if (PyUnicode_READY(self) == -1)
11610 return NULL;
11611 length = PyUnicode_GET_LENGTH(self);
11612 kind = PyUnicode_KIND(self);
11613 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011616 if (length == 1)
11617 return PyBool_FromLong(
11618 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011620 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011622 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011624 for (i = 0; i < length; i++) {
11625 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011626 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011627 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011629 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630}
11631
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011632PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011634\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011635Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011636and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011637
11638static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011639unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011640{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641 Py_ssize_t i, length;
11642 int kind;
11643 void *data;
11644
11645 if (PyUnicode_READY(self) == -1)
11646 return NULL;
11647 length = PyUnicode_GET_LENGTH(self);
11648 kind = PyUnicode_KIND(self);
11649 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011650
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011651 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011652 if (length == 1)
11653 return PyBool_FromLong(
11654 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011655
11656 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011657 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011658 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011659
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011660 for (i = 0; i < length; i++) {
11661 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011662 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011663 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011664 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011665}
11666
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011667PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011668 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011669\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011670Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011671and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011672
11673static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011674unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011675{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676 int kind;
11677 void *data;
11678 Py_ssize_t len, i;
11679
11680 if (PyUnicode_READY(self) == -1)
11681 return NULL;
11682
11683 kind = PyUnicode_KIND(self);
11684 data = PyUnicode_DATA(self);
11685 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011686
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011687 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011688 if (len == 1) {
11689 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11690 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11691 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011692
11693 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011694 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011695 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011697 for (i = 0; i < len; i++) {
11698 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011699 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011700 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011701 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011702 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011703}
11704
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011705PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011706 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011708Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011709False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710
11711static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011712unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714 Py_ssize_t i, length;
11715 int kind;
11716 void *data;
11717
11718 if (PyUnicode_READY(self) == -1)
11719 return NULL;
11720 length = PyUnicode_GET_LENGTH(self);
11721 kind = PyUnicode_KIND(self);
11722 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011725 if (length == 1)
11726 return PyBool_FromLong(
11727 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011729 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011730 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011731 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011733 for (i = 0; i < length; i++) {
11734 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011735 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011737 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738}
11739
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011740PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011741 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011743Return True if all characters in S are digits\n\
11744and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745
11746static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011747unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011749 Py_ssize_t i, length;
11750 int kind;
11751 void *data;
11752
11753 if (PyUnicode_READY(self) == -1)
11754 return NULL;
11755 length = PyUnicode_GET_LENGTH(self);
11756 kind = PyUnicode_KIND(self);
11757 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011760 if (length == 1) {
11761 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11762 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11763 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011765 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011766 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011767 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 for (i = 0; i < length; i++) {
11770 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011771 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011773 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774}
11775
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011776PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011777 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011779Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011780False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781
11782static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011783unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 Py_ssize_t i, length;
11786 int kind;
11787 void *data;
11788
11789 if (PyUnicode_READY(self) == -1)
11790 return NULL;
11791 length = PyUnicode_GET_LENGTH(self);
11792 kind = PyUnicode_KIND(self);
11793 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011796 if (length == 1)
11797 return PyBool_FromLong(
11798 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011800 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011801 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011802 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 for (i = 0; i < length; i++) {
11805 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011806 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011808 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809}
11810
Martin v. Löwis47383402007-08-15 07:32:56 +000011811int
11812PyUnicode_IsIdentifier(PyObject *self)
11813{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011814 int kind;
11815 void *data;
11816 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011817 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 if (PyUnicode_READY(self) == -1) {
11820 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011821 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011822 }
11823
11824 /* Special case for empty strings */
11825 if (PyUnicode_GET_LENGTH(self) == 0)
11826 return 0;
11827 kind = PyUnicode_KIND(self);
11828 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011829
11830 /* PEP 3131 says that the first character must be in
11831 XID_Start and subsequent characters in XID_Continue,
11832 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011833 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011834 letters, digits, underscore). However, given the current
11835 definition of XID_Start and XID_Continue, it is sufficient
11836 to check just for these, except that _ must be allowed
11837 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011839 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011840 return 0;
11841
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011842 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011843 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011844 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011845 return 1;
11846}
11847
11848PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011849 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011850\n\
11851Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011852to the language definition.\n\
11853\n\
11854Use keyword.iskeyword() to test for reserved identifiers\n\
11855such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011856
11857static PyObject*
11858unicode_isidentifier(PyObject *self)
11859{
11860 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11861}
11862
Georg Brandl559e5d72008-06-11 18:37:52 +000011863PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011864 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011865\n\
11866Return True if all characters in S are considered\n\
11867printable in repr() or S is empty, False otherwise.");
11868
11869static PyObject*
11870unicode_isprintable(PyObject *self)
11871{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 Py_ssize_t i, length;
11873 int kind;
11874 void *data;
11875
11876 if (PyUnicode_READY(self) == -1)
11877 return NULL;
11878 length = PyUnicode_GET_LENGTH(self);
11879 kind = PyUnicode_KIND(self);
11880 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011881
11882 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011883 if (length == 1)
11884 return PyBool_FromLong(
11885 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887 for (i = 0; i < length; i++) {
11888 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011889 Py_RETURN_FALSE;
11890 }
11891 }
11892 Py_RETURN_TRUE;
11893}
11894
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011895PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011896 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897\n\
11898Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011899iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900
11901static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011902unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011904 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905}
11906
Martin v. Löwis18e16552006-02-15 17:27:45 +000011907static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011908unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011910 if (PyUnicode_READY(self) == -1)
11911 return -1;
11912 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913}
11914
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011915PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011916 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011918Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011919done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920
11921static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011922unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011924 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925 Py_UCS4 fillchar = ' ';
11926
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011927 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928 return NULL;
11929
Benjamin Petersonbac79492012-01-14 13:34:47 -050011930 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011931 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932
Victor Stinnerc4b49542011-12-11 22:44:26 +010011933 if (PyUnicode_GET_LENGTH(self) >= width)
11934 return unicode_result_unchanged(self);
11935
11936 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937}
11938
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011939PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011940 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011942Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943
11944static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011945unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011947 if (PyUnicode_READY(self) == -1)
11948 return NULL;
11949 if (PyUnicode_IS_ASCII(self))
11950 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011951 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952}
11953
/* Strip modes; they double as indexes into stripformat[] below */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its three-character
   "|O:" prefix skipped */
#define STRIPNAME(i) (&stripformat[i][3])
11962
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011963/* externally visible for str.strip(unicode) */
11964PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011965_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011966{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 void *data;
11968 int kind;
11969 Py_ssize_t i, j, len;
11970 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011971 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011973 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11974 return NULL;
11975
11976 kind = PyUnicode_KIND(self);
11977 data = PyUnicode_DATA(self);
11978 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011979 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11981 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011982 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011983
Benjamin Peterson14339b62009-01-31 16:36:08 +000011984 i = 0;
11985 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011986 while (i < len) {
11987 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11988 if (!BLOOM(sepmask, ch))
11989 break;
11990 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11991 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011992 i++;
11993 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011994 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011995
Benjamin Peterson14339b62009-01-31 16:36:08 +000011996 j = len;
11997 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011998 j--;
11999 while (j >= i) {
12000 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12001 if (!BLOOM(sepmask, ch))
12002 break;
12003 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12004 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012005 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012006 }
12007
Benjamin Peterson29060642009-01-31 22:14:21 +000012008 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012009 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012010
Victor Stinner7931d9a2011-11-04 00:22:48 +010012011 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012}
12013
12014PyObject*
12015PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12016{
12017 unsigned char *data;
12018 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012019 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020
Victor Stinnerde636f32011-10-01 03:55:54 +020012021 if (PyUnicode_READY(self) == -1)
12022 return NULL;
12023
Victor Stinner684d5fd2012-05-03 02:32:34 +020012024 length = PyUnicode_GET_LENGTH(self);
12025 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012026
Victor Stinner684d5fd2012-05-03 02:32:34 +020012027 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012028 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012029
Victor Stinnerde636f32011-10-01 03:55:54 +020012030 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012031 PyErr_SetString(PyExc_IndexError, "string index out of range");
12032 return NULL;
12033 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012034 if (start >= length || end < start)
12035 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012036
Victor Stinner684d5fd2012-05-03 02:32:34 +020012037 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012038 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012039 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012040 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012041 }
12042 else {
12043 kind = PyUnicode_KIND(self);
12044 data = PyUnicode_1BYTE_DATA(self);
12045 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012046 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012047 length);
12048 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012049}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050
12051static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012052do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054 Py_ssize_t len, i, j;
12055
12056 if (PyUnicode_READY(self) == -1)
12057 return NULL;
12058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012059 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012060
Victor Stinnercc7af722013-04-09 22:39:24 +020012061 if (PyUnicode_IS_ASCII(self)) {
12062 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12063
12064 i = 0;
12065 if (striptype != RIGHTSTRIP) {
12066 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012067 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012068 if (!_Py_ascii_whitespace[ch])
12069 break;
12070 i++;
12071 }
12072 }
12073
12074 j = len;
12075 if (striptype != LEFTSTRIP) {
12076 j--;
12077 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012078 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012079 if (!_Py_ascii_whitespace[ch])
12080 break;
12081 j--;
12082 }
12083 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012084 }
12085 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012086 else {
12087 int kind = PyUnicode_KIND(self);
12088 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012089
Victor Stinnercc7af722013-04-09 22:39:24 +020012090 i = 0;
12091 if (striptype != RIGHTSTRIP) {
12092 while (i < len) {
12093 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12094 if (!Py_UNICODE_ISSPACE(ch))
12095 break;
12096 i++;
12097 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012098 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012099
12100 j = len;
12101 if (striptype != LEFTSTRIP) {
12102 j--;
12103 while (j >= i) {
12104 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12105 if (!Py_UNICODE_ISSPACE(ch))
12106 break;
12107 j--;
12108 }
12109 j++;
12110 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012111 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012112
Victor Stinner7931d9a2011-11-04 00:22:48 +010012113 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012114}
12115
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012116
12117static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012118do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012119{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012120 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012121
Serhiy Storchakac6792272013-10-19 21:03:34 +030012122 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012123 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012124
Benjamin Peterson14339b62009-01-31 16:36:08 +000012125 if (sep != NULL && sep != Py_None) {
12126 if (PyUnicode_Check(sep))
12127 return _PyUnicode_XStrip(self, striptype, sep);
12128 else {
12129 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012130 "%s arg must be None or str",
12131 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012132 return NULL;
12133 }
12134 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012135
Benjamin Peterson14339b62009-01-31 16:36:08 +000012136 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012137}
12138
12139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012140PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012141 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012142\n\
12143Return a copy of the string S with leading and trailing\n\
12144whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012145If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012146
12147static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012148unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012149{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012150 if (PyTuple_GET_SIZE(args) == 0)
12151 return do_strip(self, BOTHSTRIP); /* Common case */
12152 else
12153 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012154}
12155
12156
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012157PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012158 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012159\n\
12160Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012161If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012162
12163static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012164unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012165{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012166 if (PyTuple_GET_SIZE(args) == 0)
12167 return do_strip(self, LEFTSTRIP); /* Common case */
12168 else
12169 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012170}
12171
12172
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012173PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012174 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012175\n\
12176Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012177If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012178
12179static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012180unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012181{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012182 if (PyTuple_GET_SIZE(args) == 0)
12183 return do_strip(self, RIGHTSTRIP); /* Common case */
12184 else
12185 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012186}
12187
12188
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012190unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012191{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012192 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194
Serhiy Storchaka05997252013-01-26 12:14:02 +020012195 if (len < 1)
12196 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197
Victor Stinnerc4b49542011-12-11 22:44:26 +010012198 /* no repeat, return original string */
12199 if (len == 1)
12200 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012201
Benjamin Petersonbac79492012-01-14 13:34:47 -050012202 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012203 return NULL;
12204
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012205 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012206 PyErr_SetString(PyExc_OverflowError,
12207 "repeated string is too long");
12208 return NULL;
12209 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012211
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012212 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012213 if (!u)
12214 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012215 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012217 if (PyUnicode_GET_LENGTH(str) == 1) {
12218 const int kind = PyUnicode_KIND(str);
12219 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012220 if (kind == PyUnicode_1BYTE_KIND) {
12221 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012222 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012223 }
12224 else if (kind == PyUnicode_2BYTE_KIND) {
12225 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012226 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012227 ucs2[n] = fill_char;
12228 } else {
12229 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12230 assert(kind == PyUnicode_4BYTE_KIND);
12231 for (n = 0; n < len; ++n)
12232 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012233 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234 }
12235 else {
12236 /* number of characters copied this far */
12237 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012238 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012239 char *to = (char *) PyUnicode_DATA(u);
12240 Py_MEMCPY(to, PyUnicode_DATA(str),
12241 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012242 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012243 n = (done <= nchars-done) ? done : nchars-done;
12244 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012245 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012246 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247 }
12248
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012249 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012250 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251}
12252
Alexander Belopolsky40018472011-02-26 01:02:56 +000012253PyObject *
12254PyUnicode_Replace(PyObject *obj,
12255 PyObject *subobj,
12256 PyObject *replobj,
12257 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258{
12259 PyObject *self;
12260 PyObject *str1;
12261 PyObject *str2;
12262 PyObject *result;
12263
12264 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012265 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012266 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012268 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012269 Py_DECREF(self);
12270 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271 }
12272 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012273 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012274 Py_DECREF(self);
12275 Py_DECREF(str1);
12276 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012278 if (PyUnicode_READY(self) == -1 ||
12279 PyUnicode_READY(str1) == -1 ||
12280 PyUnicode_READY(str2) == -1)
12281 result = NULL;
12282 else
12283 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284 Py_DECREF(self);
12285 Py_DECREF(str1);
12286 Py_DECREF(str2);
12287 return result;
12288}
12289
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012290PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012291 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012292\n\
12293Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012294old replaced by new. If the optional argument count is\n\
12295given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296
12297static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012298unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012299{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300 PyObject *str1;
12301 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012302 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303 PyObject *result;
12304
Martin v. Löwis18e16552006-02-15 17:27:45 +000012305 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012306 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012307 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012308 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012309 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012310 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 return NULL;
12312 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012313 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012314 Py_DECREF(str1);
12315 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012316 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012317 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12318 result = NULL;
12319 else
12320 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012321
12322 Py_DECREF(str1);
12323 Py_DECREF(str2);
12324 return result;
12325}
12326
Alexander Belopolsky40018472011-02-26 01:02:56 +000012327static PyObject *
12328unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012329{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012330 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 Py_ssize_t isize;
12332 Py_ssize_t osize, squote, dquote, i, o;
12333 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012334 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012336
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012337 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012338 return NULL;
12339
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 isize = PyUnicode_GET_LENGTH(unicode);
12341 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012342
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343 /* Compute length of output, quote characters, and
12344 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012345 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 max = 127;
12347 squote = dquote = 0;
12348 ikind = PyUnicode_KIND(unicode);
12349 for (i = 0; i < isize; i++) {
12350 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012351 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012353 case '\'': squote++; break;
12354 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012356 incr = 2;
12357 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 default:
12359 /* Fast-path ASCII */
12360 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012361 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012363 ;
12364 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012365 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012367 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012368 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012369 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012370 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012371 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012373 if (osize > PY_SSIZE_T_MAX - incr) {
12374 PyErr_SetString(PyExc_OverflowError,
12375 "string is too long to generate repr");
12376 return NULL;
12377 }
12378 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012379 }
12380
12381 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012382 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012383 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012384 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012385 if (dquote)
12386 /* Both squote and dquote present. Use squote,
12387 and escape them */
12388 osize += squote;
12389 else
12390 quote = '"';
12391 }
Victor Stinner55c08782013-04-14 18:45:39 +020012392 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012393
12394 repr = PyUnicode_New(osize, max);
12395 if (repr == NULL)
12396 return NULL;
12397 okind = PyUnicode_KIND(repr);
12398 odata = PyUnicode_DATA(repr);
12399
12400 PyUnicode_WRITE(okind, odata, 0, quote);
12401 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012402 if (unchanged) {
12403 _PyUnicode_FastCopyCharacters(repr, 1,
12404 unicode, 0,
12405 isize);
12406 }
12407 else {
12408 for (i = 0, o = 1; i < isize; i++) {
12409 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012410
Victor Stinner55c08782013-04-14 18:45:39 +020012411 /* Escape quotes and backslashes */
12412 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012413 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012414 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012415 continue;
12416 }
12417
12418 /* Map special whitespace to '\t', \n', '\r' */
12419 if (ch == '\t') {
12420 PyUnicode_WRITE(okind, odata, o++, '\\');
12421 PyUnicode_WRITE(okind, odata, o++, 't');
12422 }
12423 else if (ch == '\n') {
12424 PyUnicode_WRITE(okind, odata, o++, '\\');
12425 PyUnicode_WRITE(okind, odata, o++, 'n');
12426 }
12427 else if (ch == '\r') {
12428 PyUnicode_WRITE(okind, odata, o++, '\\');
12429 PyUnicode_WRITE(okind, odata, o++, 'r');
12430 }
12431
12432 /* Map non-printable US ASCII to '\xhh' */
12433 else if (ch < ' ' || ch == 0x7F) {
12434 PyUnicode_WRITE(okind, odata, o++, '\\');
12435 PyUnicode_WRITE(okind, odata, o++, 'x');
12436 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12437 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12438 }
12439
12440 /* Copy ASCII characters as-is */
12441 else if (ch < 0x7F) {
12442 PyUnicode_WRITE(okind, odata, o++, ch);
12443 }
12444
12445 /* Non-ASCII characters */
12446 else {
12447 /* Map Unicode whitespace and control characters
12448 (categories Z* and C* except ASCII space)
12449 */
12450 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12451 PyUnicode_WRITE(okind, odata, o++, '\\');
12452 /* Map 8-bit characters to '\xhh' */
12453 if (ch <= 0xff) {
12454 PyUnicode_WRITE(okind, odata, o++, 'x');
12455 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12456 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12457 }
12458 /* Map 16-bit characters to '\uxxxx' */
12459 else if (ch <= 0xffff) {
12460 PyUnicode_WRITE(okind, odata, o++, 'u');
12461 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12462 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12463 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12464 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12465 }
12466 /* Map 21-bit characters to '\U00xxxxxx' */
12467 else {
12468 PyUnicode_WRITE(okind, odata, o++, 'U');
12469 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12470 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12471 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12472 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12473 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12474 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12475 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12476 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12477 }
12478 }
12479 /* Copy characters as-is */
12480 else {
12481 PyUnicode_WRITE(okind, odata, o++, ch);
12482 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012483 }
12484 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012485 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012486 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012487 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012488 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012489}
12490
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012491PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012492 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012493\n\
12494Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012495such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012496arguments start and end are interpreted as in slice notation.\n\
12497\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012498Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012499
12500static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012501unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012502{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012503 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012504 Py_ssize_t start;
12505 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012506 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507
Jesus Ceaac451502011-04-20 17:09:23 +020012508 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12509 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012510 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511
Christian Heimesea71a522013-06-29 21:17:34 +020012512 if (PyUnicode_READY(self) == -1) {
12513 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012514 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012515 }
12516 if (PyUnicode_READY(substring) == -1) {
12517 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012519 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012520
Victor Stinner7931d9a2011-11-04 00:22:48 +010012521 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522
12523 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012524
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012525 if (result == -2)
12526 return NULL;
12527
Christian Heimes217cfd12007-12-02 14:31:20 +000012528 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529}
12530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012531PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012532 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012534Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535
12536static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012537unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012538{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012539 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012540 Py_ssize_t start;
12541 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012542 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012543
Jesus Ceaac451502011-04-20 17:09:23 +020012544 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12545 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012546 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547
Christian Heimesea71a522013-06-29 21:17:34 +020012548 if (PyUnicode_READY(self) == -1) {
12549 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012551 }
12552 if (PyUnicode_READY(substring) == -1) {
12553 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012554 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012555 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012556
Victor Stinner7931d9a2011-11-04 00:22:48 +010012557 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012558
12559 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012561 if (result == -2)
12562 return NULL;
12563
Guido van Rossumd57fd912000-03-10 22:53:23 +000012564 if (result < 0) {
12565 PyErr_SetString(PyExc_ValueError, "substring not found");
12566 return NULL;
12567 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012568
Christian Heimes217cfd12007-12-02 14:31:20 +000012569 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570}
12571
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012572PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012573 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012574\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012575Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012576done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577
12578static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012579unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012580{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012581 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012582 Py_UCS4 fillchar = ' ';
12583
Victor Stinnere9a29352011-10-01 02:14:59 +020012584 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012585 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012586
Benjamin Petersonbac79492012-01-14 13:34:47 -050012587 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012588 return NULL;
12589
Victor Stinnerc4b49542011-12-11 22:44:26 +010012590 if (PyUnicode_GET_LENGTH(self) >= width)
12591 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012592
Victor Stinnerc4b49542011-12-11 22:44:26 +010012593 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012594}
12595
Alexander Belopolsky40018472011-02-26 01:02:56 +000012596PyObject *
12597PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012598{
12599 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012600
Guido van Rossumd57fd912000-03-10 22:53:23 +000012601 s = PyUnicode_FromObject(s);
12602 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012603 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012604 if (sep != NULL) {
12605 sep = PyUnicode_FromObject(sep);
12606 if (sep == NULL) {
12607 Py_DECREF(s);
12608 return NULL;
12609 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610 }
12611
Victor Stinner9310abb2011-10-05 00:59:23 +020012612 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613
12614 Py_DECREF(s);
12615 Py_XDECREF(sep);
12616 return result;
12617}
12618
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012619PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012620 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012621\n\
12622Return a list of the words in S, using sep as the\n\
12623delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012624splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012625whitespace string is a separator and empty strings are\n\
12626removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012627
12628static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012629unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012630{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012631 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012632 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012633 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012634
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012635 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12636 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637 return NULL;
12638
12639 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012640 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012642 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012643 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012644 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012645}
12646
Thomas Wouters477c8d52006-05-27 19:21:47 +000012647PyObject *
12648PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12649{
12650 PyObject* str_obj;
12651 PyObject* sep_obj;
12652 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653 int kind1, kind2, kind;
12654 void *buf1 = NULL, *buf2 = NULL;
12655 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012656
12657 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012658 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012659 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012660 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012661 if (!sep_obj) {
12662 Py_DECREF(str_obj);
12663 return NULL;
12664 }
12665 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12666 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012667 Py_DECREF(str_obj);
12668 return NULL;
12669 }
12670
Victor Stinner14f8f022011-10-05 20:58:25 +020012671 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012672 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012673 kind = Py_MAX(kind1, kind2);
12674 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012675 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012676 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012677 if (!buf1)
12678 goto onError;
12679 buf2 = PyUnicode_DATA(sep_obj);
12680 if (kind2 != kind)
12681 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12682 if (!buf2)
12683 goto onError;
12684 len1 = PyUnicode_GET_LENGTH(str_obj);
12685 len2 = PyUnicode_GET_LENGTH(sep_obj);
12686
Benjamin Petersonead6b532011-12-20 17:23:42 -060012687 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012689 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12690 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12691 else
12692 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012693 break;
12694 case PyUnicode_2BYTE_KIND:
12695 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12696 break;
12697 case PyUnicode_4BYTE_KIND:
12698 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12699 break;
12700 default:
12701 assert(0);
12702 out = 0;
12703 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012704
12705 Py_DECREF(sep_obj);
12706 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012707 if (kind1 != kind)
12708 PyMem_Free(buf1);
12709 if (kind2 != kind)
12710 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012711
12712 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012713 onError:
12714 Py_DECREF(sep_obj);
12715 Py_DECREF(str_obj);
12716 if (kind1 != kind && buf1)
12717 PyMem_Free(buf1);
12718 if (kind2 != kind && buf2)
12719 PyMem_Free(buf2);
12720 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012721}
12722
12723
12724PyObject *
12725PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12726{
12727 PyObject* str_obj;
12728 PyObject* sep_obj;
12729 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012730 int kind1, kind2, kind;
12731 void *buf1 = NULL, *buf2 = NULL;
12732 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012733
12734 str_obj = PyUnicode_FromObject(str_in);
12735 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012736 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012737 sep_obj = PyUnicode_FromObject(sep_in);
12738 if (!sep_obj) {
12739 Py_DECREF(str_obj);
12740 return NULL;
12741 }
12742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012743 kind1 = PyUnicode_KIND(str_in);
12744 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012745 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012746 buf1 = PyUnicode_DATA(str_in);
12747 if (kind1 != kind)
12748 buf1 = _PyUnicode_AsKind(str_in, kind);
12749 if (!buf1)
12750 goto onError;
12751 buf2 = PyUnicode_DATA(sep_obj);
12752 if (kind2 != kind)
12753 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12754 if (!buf2)
12755 goto onError;
12756 len1 = PyUnicode_GET_LENGTH(str_obj);
12757 len2 = PyUnicode_GET_LENGTH(sep_obj);
12758
Benjamin Petersonead6b532011-12-20 17:23:42 -060012759 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012760 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012761 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12762 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12763 else
12764 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012765 break;
12766 case PyUnicode_2BYTE_KIND:
12767 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12768 break;
12769 case PyUnicode_4BYTE_KIND:
12770 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12771 break;
12772 default:
12773 assert(0);
12774 out = 0;
12775 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012776
12777 Py_DECREF(sep_obj);
12778 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012779 if (kind1 != kind)
12780 PyMem_Free(buf1);
12781 if (kind2 != kind)
12782 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012783
12784 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012785 onError:
12786 Py_DECREF(sep_obj);
12787 Py_DECREF(str_obj);
12788 if (kind1 != kind && buf1)
12789 PyMem_Free(buf1);
12790 if (kind2 != kind && buf2)
12791 PyMem_Free(buf2);
12792 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012793}
12794
12795PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012796 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012797\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012798Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012799the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012800found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012801
12802static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012803unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012804{
Victor Stinner9310abb2011-10-05 00:59:23 +020012805 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012806}
12807
12808PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012809 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012810\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012811Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012812the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012813separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012814
12815static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012816unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012817{
Victor Stinner9310abb2011-10-05 00:59:23 +020012818 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012819}
12820
Alexander Belopolsky40018472011-02-26 01:02:56 +000012821PyObject *
12822PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012823{
12824 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012825
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012826 s = PyUnicode_FromObject(s);
12827 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012828 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012829 if (sep != NULL) {
12830 sep = PyUnicode_FromObject(sep);
12831 if (sep == NULL) {
12832 Py_DECREF(s);
12833 return NULL;
12834 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012835 }
12836
Victor Stinner9310abb2011-10-05 00:59:23 +020012837 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012838
12839 Py_DECREF(s);
12840 Py_XDECREF(sep);
12841 return result;
12842}
12843
12844PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012845 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012846\n\
12847Return a list of the words in S, using sep as the\n\
12848delimiter string, starting at the end of the string and\n\
12849working to the front. If maxsplit is given, at most maxsplit\n\
12850splits are done. If sep is not specified, any whitespace string\n\
12851is a separator.");
12852
12853static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012854unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012855{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012856 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012857 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012858 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012859
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012860 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12861 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012862 return NULL;
12863
12864 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012865 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012866 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012867 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012868 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012869 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012870}
12871
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012872PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012873 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874\n\
12875Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012876Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012877is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012878
12879static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012880unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012881{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012882 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012883 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012884
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012885 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12886 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012887 return NULL;
12888
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012889 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012890}
12891
12892static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012893PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012894{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012895 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012896}
12897
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012898PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012899 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012900\n\
12901Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012902and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012903
12904static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012905unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012906{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012907 if (PyUnicode_READY(self) == -1)
12908 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012909 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012910}
12911
/* str.maketrans() -- Argument Clinic section.

   The comment that opens with the clinic input marker is the
   declarative signature (a static method taking x, plus optional
   y and z, all positional-only).  What follows the "clinic start
   generated code" marker -- the docstring, the
   UNICODE_MAKETRANS_METHODDEF entry, the prototype of
   unicode_maketrans_impl() and the unicode_maketrans() wrapper --
   is tool output.  The wrapper only unpacks the argument tuple with
   the "O|UU:maketrans" format and forwards x, y, z to
   unicode_maketrans_impl(); its first parameter is not used.

   NOTE(review): the closing clinic marker carries output=/input=
   hashes, so hand edits inside this section presumably need to be
   made in the input block and regenerated -- confirm against the
   Argument Clinic documentation before touching it. */
/*[clinic input]

@staticmethod
str.maketrans as unicode_maketrans

    x: object

    y: unicode=NULL

    z: unicode=NULL

    /

Return a translation table usable for str.translate().

If there is only one argument, it must be a dictionary mapping Unicode
ordinals (integers) or characters to Unicode ordinals, strings or None.
Character keys will be then converted to ordinals.
If there are two arguments, they must be strings of equal length, and
in the resulting dictionary, each character in x will be mapped to the
character at the same position in y. If there is a third argument, it
must be a string, whose characters will be mapped to None in the result.
[clinic start generated code]*/

PyDoc_STRVAR(unicode_maketrans__doc__,
"maketrans(x, y=None, z=None, /)\n"
"--\n"
"\n"
"Return a translation table usable for str.translate().\n"
"\n"
"If there is only one argument, it must be a dictionary mapping Unicode\n"
"ordinals (integers) or characters to Unicode ordinals, strings or None.\n"
"Character keys will be then converted to ordinals.\n"
"If there are two arguments, they must be strings of equal length, and\n"
"in the resulting dictionary, each character in x will be mapped to the\n"
"character at the same position in y. If there is a third argument, it\n"
"must be a string, whose characters will be mapped to None in the result.");

#define UNICODE_MAKETRANS_METHODDEF \
    {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__},

static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z);

static PyObject *
unicode_maketrans(void *null, PyObject *args)
{
    PyObject *return_value = NULL;
    PyObject *x;
    PyObject *y = NULL;
    PyObject *z = NULL;

    if (!PyArg_ParseTuple(args,
        "O|UU:maketrans",
        &x, &y, &z))
        goto exit;
    return_value = unicode_maketrans_impl(x, y, z);

exit:
    return return_value;
}
12973
12974static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012975unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Larry Hastings2623c8c2014-02-08 22:15:29 -080012976/*[clinic end generated code: output=566edf630f77436a input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012977{
Georg Brandlceee0772007-11-27 23:48:05 +000012978 PyObject *new = NULL, *key, *value;
12979 Py_ssize_t i = 0;
12980 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012981
Georg Brandlceee0772007-11-27 23:48:05 +000012982 new = PyDict_New();
12983 if (!new)
12984 return NULL;
12985 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012986 int x_kind, y_kind, z_kind;
12987 void *x_data, *y_data, *z_data;
12988
Georg Brandlceee0772007-11-27 23:48:05 +000012989 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012990 if (!PyUnicode_Check(x)) {
12991 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12992 "be a string if there is a second argument");
12993 goto err;
12994 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012995 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012996 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12997 "arguments must have equal length");
12998 goto err;
12999 }
13000 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013001 x_kind = PyUnicode_KIND(x);
13002 y_kind = PyUnicode_KIND(y);
13003 x_data = PyUnicode_DATA(x);
13004 y_data = PyUnicode_DATA(y);
13005 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13006 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013007 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013008 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013009 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013010 if (!value) {
13011 Py_DECREF(key);
13012 goto err;
13013 }
Georg Brandlceee0772007-11-27 23:48:05 +000013014 res = PyDict_SetItem(new, key, value);
13015 Py_DECREF(key);
13016 Py_DECREF(value);
13017 if (res < 0)
13018 goto err;
13019 }
13020 /* create entries for deleting chars in z */
13021 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013022 z_kind = PyUnicode_KIND(z);
13023 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013024 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013025 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013026 if (!key)
13027 goto err;
13028 res = PyDict_SetItem(new, key, Py_None);
13029 Py_DECREF(key);
13030 if (res < 0)
13031 goto err;
13032 }
13033 }
13034 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013035 int kind;
13036 void *data;
13037
Georg Brandlceee0772007-11-27 23:48:05 +000013038 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013039 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013040 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13041 "to maketrans it must be a dict");
13042 goto err;
13043 }
13044 /* copy entries into the new dict, converting string keys to int keys */
13045 while (PyDict_Next(x, &i, &key, &value)) {
13046 if (PyUnicode_Check(key)) {
13047 /* convert string keys to integer keys */
13048 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013049 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013050 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13051 "table must be of length 1");
13052 goto err;
13053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013054 kind = PyUnicode_KIND(key);
13055 data = PyUnicode_DATA(key);
13056 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013057 if (!newkey)
13058 goto err;
13059 res = PyDict_SetItem(new, newkey, value);
13060 Py_DECREF(newkey);
13061 if (res < 0)
13062 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013063 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013064 /* just keep integer keys */
13065 if (PyDict_SetItem(new, key, value) < 0)
13066 goto err;
13067 } else {
13068 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13069 "be strings or integers");
13070 goto err;
13071 }
13072 }
13073 }
13074 return new;
13075 err:
13076 Py_DECREF(new);
13077 return NULL;
13078}
13079
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013080PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013081 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082\n\
13083Return a copy of the string S, where all characters have been mapped\n\
13084through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000013085Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000013086Unmapped characters are left untouched. Characters mapped to None\n\
13087are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013088
13089static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013090unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013091{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013092 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013093}
13094
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013095PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013096 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013097\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013098Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013099
13100static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013101unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013103 if (PyUnicode_READY(self) == -1)
13104 return NULL;
13105 if (PyUnicode_IS_ASCII(self))
13106 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013107 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013108}
13109
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013110PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013111 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013112\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013113Pad a numeric string S with zeros on the left, to fill a field\n\
13114of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013115
13116static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013117unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013119 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013120 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013121 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013122 int kind;
13123 void *data;
13124 Py_UCS4 chr;
13125
Martin v. Löwis18e16552006-02-15 17:27:45 +000013126 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127 return NULL;
13128
Benjamin Petersonbac79492012-01-14 13:34:47 -050013129 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013130 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013131
Victor Stinnerc4b49542011-12-11 22:44:26 +010013132 if (PyUnicode_GET_LENGTH(self) >= width)
13133 return unicode_result_unchanged(self);
13134
13135 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136
13137 u = pad(self, fill, 0, '0');
13138
Walter Dörwald068325e2002-04-15 13:36:47 +000013139 if (u == NULL)
13140 return NULL;
13141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013142 kind = PyUnicode_KIND(u);
13143 data = PyUnicode_DATA(u);
13144 chr = PyUnicode_READ(kind, data, fill);
13145
13146 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013147 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013148 PyUnicode_WRITE(kind, data, 0, chr);
13149 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013150 }
13151
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013152 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013153 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013154}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013155
#if 0
/* Compiled out.  Thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
13163
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013164PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013165 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013166\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013167Return True if S starts with the specified prefix, False otherwise.\n\
13168With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013169With optional end, stop comparing S at that position.\n\
13170prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013171
13172static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013173unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013174 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013175{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013176 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013177 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013178 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013179 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013180 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013181
Jesus Ceaac451502011-04-20 17:09:23 +020013182 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013183 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013184 if (PyTuple_Check(subobj)) {
13185 Py_ssize_t i;
13186 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013187 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013188 if (substring == NULL)
13189 return NULL;
13190 result = tailmatch(self, substring, start, end, -1);
13191 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013192 if (result == -1)
13193 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013194 if (result) {
13195 Py_RETURN_TRUE;
13196 }
13197 }
13198 /* nothing matched */
13199 Py_RETURN_FALSE;
13200 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013201 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013202 if (substring == NULL) {
13203 if (PyErr_ExceptionMatches(PyExc_TypeError))
13204 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13205 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013206 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013207 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013208 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013209 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013210 if (result == -1)
13211 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013212 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013213}
13214
13215
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013216PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013217 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013218\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013219Return True if S ends with the specified suffix, False otherwise.\n\
13220With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013221With optional end, stop comparing S at that position.\n\
13222suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013223
13224static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013225unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013226 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013227{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013228 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013229 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013230 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013231 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013232 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013233
Jesus Ceaac451502011-04-20 17:09:23 +020013234 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013235 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013236 if (PyTuple_Check(subobj)) {
13237 Py_ssize_t i;
13238 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013239 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013240 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013241 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013242 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013243 result = tailmatch(self, substring, start, end, +1);
13244 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013245 if (result == -1)
13246 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013247 if (result) {
13248 Py_RETURN_TRUE;
13249 }
13250 }
13251 Py_RETURN_FALSE;
13252 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013253 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013254 if (substring == NULL) {
13255 if (PyErr_ExceptionMatches(PyExc_TypeError))
13256 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13257 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013258 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013259 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013260 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013261 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013262 if (result == -1)
13263 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013264 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013265}
13266
Victor Stinner202fdca2012-05-07 12:47:02 +020013267Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013268_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013269{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013270 if (!writer->readonly)
13271 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13272 else {
13273 /* Copy-on-write mode: set buffer size to 0 so
13274 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13275 * next write. */
13276 writer->size = 0;
13277 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013278 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13279 writer->data = PyUnicode_DATA(writer->buffer);
13280 writer->kind = PyUnicode_KIND(writer->buffer);
13281}
13282
Victor Stinnerd3f08822012-05-29 12:57:52 +020013283void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013284_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013285{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013286 memset(writer, 0, sizeof(*writer));
13287#ifdef Py_DEBUG
13288 writer->kind = 5; /* invalid kind */
13289#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013290 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013291}
13292
Victor Stinnerd3f08822012-05-29 12:57:52 +020013293int
13294_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13295 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013296{
Victor Stinner6989ba02013-11-18 21:08:39 +010013297#ifdef MS_WINDOWS
13298 /* On Windows, overallocate by 50% is the best factor */
13299# define OVERALLOCATE_FACTOR 2
13300#else
13301 /* On Linux, overallocate by 25% is the best factor */
13302# define OVERALLOCATE_FACTOR 4
13303#endif
Victor Stinner202fdca2012-05-07 12:47:02 +020013304 Py_ssize_t newlen;
13305 PyObject *newbuffer;
13306
Victor Stinnerd3f08822012-05-29 12:57:52 +020013307 assert(length > 0);
13308
Victor Stinner202fdca2012-05-07 12:47:02 +020013309 if (length > PY_SSIZE_T_MAX - writer->pos) {
13310 PyErr_NoMemory();
13311 return -1;
13312 }
13313 newlen = writer->pos + length;
13314
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013315 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013316
Victor Stinnerd3f08822012-05-29 12:57:52 +020013317 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013318 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013319 if (writer->overallocate
13320 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13321 /* overallocate to limit the number of realloc() */
13322 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013323 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013324 if (newlen < writer->min_length)
13325 newlen = writer->min_length;
13326
Victor Stinnerd3f08822012-05-29 12:57:52 +020013327 writer->buffer = PyUnicode_New(newlen, maxchar);
13328 if (writer->buffer == NULL)
13329 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013330 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013331 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013332 if (writer->overallocate
13333 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13334 /* overallocate to limit the number of realloc() */
13335 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013336 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013337 if (newlen < writer->min_length)
13338 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013339
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013340 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013341 /* resize + widen */
13342 newbuffer = PyUnicode_New(newlen, maxchar);
13343 if (newbuffer == NULL)
13344 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013345 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13346 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013347 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013348 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013349 }
13350 else {
13351 newbuffer = resize_compact(writer->buffer, newlen);
13352 if (newbuffer == NULL)
13353 return -1;
13354 }
13355 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013356 }
13357 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013358 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013359 newbuffer = PyUnicode_New(writer->size, maxchar);
13360 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013361 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013362 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13363 writer->buffer, 0, writer->pos);
13364 Py_DECREF(writer->buffer);
13365 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013366 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013367 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013368 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013369
13370#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013371}
13372
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013373Py_LOCAL_INLINE(int)
13374_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013375{
13376 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13377 return -1;
13378 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13379 writer->pos++;
13380 return 0;
13381}
13382
13383int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013384_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13385{
13386 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13387}
13388
13389int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013390_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13391{
13392 Py_UCS4 maxchar;
13393 Py_ssize_t len;
13394
13395 if (PyUnicode_READY(str) == -1)
13396 return -1;
13397 len = PyUnicode_GET_LENGTH(str);
13398 if (len == 0)
13399 return 0;
13400 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13401 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013402 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013403 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013404 Py_INCREF(str);
13405 writer->buffer = str;
13406 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013407 writer->pos += len;
13408 return 0;
13409 }
13410 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13411 return -1;
13412 }
13413 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13414 str, 0, len);
13415 writer->pos += len;
13416 return 0;
13417}
13418
Victor Stinnere215d962012-10-06 23:03:36 +020013419int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013420_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13421 Py_ssize_t start, Py_ssize_t end)
13422{
13423 Py_UCS4 maxchar;
13424 Py_ssize_t len;
13425
13426 if (PyUnicode_READY(str) == -1)
13427 return -1;
13428
13429 assert(0 <= start);
13430 assert(end <= PyUnicode_GET_LENGTH(str));
13431 assert(start <= end);
13432
13433 if (end == 0)
13434 return 0;
13435
13436 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13437 return _PyUnicodeWriter_WriteStr(writer, str);
13438
13439 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13440 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13441 else
13442 maxchar = writer->maxchar;
13443 len = end - start;
13444
13445 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13446 return -1;
13447
13448 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13449 str, start, len);
13450 writer->pos += len;
13451 return 0;
13452}
13453
13454int
Victor Stinner4a587072013-11-19 12:54:53 +010013455_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13456 const char *ascii, Py_ssize_t len)
13457{
13458 if (len == -1)
13459 len = strlen(ascii);
13460
13461 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13462
13463 if (writer->buffer == NULL && !writer->overallocate) {
13464 PyObject *str;
13465
13466 str = _PyUnicode_FromASCII(ascii, len);
13467 if (str == NULL)
13468 return -1;
13469
13470 writer->readonly = 1;
13471 writer->buffer = str;
13472 _PyUnicodeWriter_Update(writer);
13473 writer->pos += len;
13474 return 0;
13475 }
13476
13477 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13478 return -1;
13479
13480 switch (writer->kind)
13481 {
13482 case PyUnicode_1BYTE_KIND:
13483 {
13484 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13485 Py_UCS1 *data = writer->data;
13486
13487 Py_MEMCPY(data + writer->pos, str, len);
13488 break;
13489 }
13490 case PyUnicode_2BYTE_KIND:
13491 {
13492 _PyUnicode_CONVERT_BYTES(
13493 Py_UCS1, Py_UCS2,
13494 ascii, ascii + len,
13495 (Py_UCS2 *)writer->data + writer->pos);
13496 break;
13497 }
13498 case PyUnicode_4BYTE_KIND:
13499 {
13500 _PyUnicode_CONVERT_BYTES(
13501 Py_UCS1, Py_UCS4,
13502 ascii, ascii + len,
13503 (Py_UCS4 *)writer->data + writer->pos);
13504 break;
13505 }
13506 default:
13507 assert(0);
13508 }
13509
13510 writer->pos += len;
13511 return 0;
13512}
13513
13514int
13515_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13516 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013517{
13518 Py_UCS4 maxchar;
13519
13520 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13521 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13522 return -1;
13523 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13524 writer->pos += len;
13525 return 0;
13526}
13527
Victor Stinnerd3f08822012-05-29 12:57:52 +020013528PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013529_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013530{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013531 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013532 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013533 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013534 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013535 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013536 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013537 str = writer->buffer;
13538 writer->buffer = NULL;
13539 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13540 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013541 }
13542 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13543 PyObject *newbuffer;
13544 newbuffer = resize_compact(writer->buffer, writer->pos);
13545 if (newbuffer == NULL) {
Serhiy Storchakadfe98a12014-02-09 13:46:20 +020013546 Py_CLEAR(writer->buffer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013547 return NULL;
13548 }
13549 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013550 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013551 str = writer->buffer;
13552 writer->buffer = NULL;
13553 assert(_PyUnicode_CheckConsistency(str, 1));
13554 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013555}
13556
Victor Stinnerd3f08822012-05-29 12:57:52 +020013557void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013558_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013559{
13560 Py_CLEAR(writer->buffer);
13561}
13562
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013563#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013564
13565PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013566 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013567\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013568Return a formatted version of S, using substitutions from args and kwargs.\n\
13569The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013570
Eric Smith27bbca62010-11-04 17:06:58 +000013571PyDoc_STRVAR(format_map__doc__,
13572 "S.format_map(mapping) -> str\n\
13573\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013574Return a formatted version of S, using substitutions from mapping.\n\
13575The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013576
Eric Smith4a7d76d2008-05-30 18:10:19 +000013577static PyObject *
13578unicode__format__(PyObject* self, PyObject* args)
13579{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013580 PyObject *format_spec;
13581 _PyUnicodeWriter writer;
13582 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013583
13584 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13585 return NULL;
13586
Victor Stinnerd3f08822012-05-29 12:57:52 +020013587 if (PyUnicode_READY(self) == -1)
13588 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013589 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013590 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13591 self, format_spec, 0,
13592 PyUnicode_GET_LENGTH(format_spec));
13593 if (ret == -1) {
13594 _PyUnicodeWriter_Dealloc(&writer);
13595 return NULL;
13596 }
13597 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013598}
13599
Eric Smith8c663262007-08-25 02:26:07 +000013600PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013601 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013602\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013603Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013604
13605static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013606unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013607{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013608 Py_ssize_t size;
13609
13610 /* If it's a compact object, account for base structure +
13611 character data. */
13612 if (PyUnicode_IS_COMPACT_ASCII(v))
13613 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13614 else if (PyUnicode_IS_COMPACT(v))
13615 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013616 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013617 else {
13618 /* If it is a two-block object, account for base object, and
13619 for character block if present. */
13620 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013621 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013622 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013623 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013624 }
13625 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013626 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013627 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013628 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013629 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013630 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013631
13632 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013633}
13634
13635PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013636 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013637
13638static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013639unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013640{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013641 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013642 if (!copy)
13643 return NULL;
13644 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013645}
13646
Guido van Rossumd57fd912000-03-10 22:53:23 +000013647static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013648 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013649 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013650 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13651 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013652 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13653 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013654 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013655 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13656 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13657 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013658 {"expandtabs", (PyCFunction) unicode_expandtabs,
13659 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013660 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013661 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013662 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13663 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13664 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013665 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013666 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13667 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13668 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013669 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013670 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013671 {"splitlines", (PyCFunction) unicode_splitlines,
13672 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013673 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013674 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13675 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13676 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13677 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13678 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13679 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13680 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13681 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13682 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13683 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13684 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13685 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13686 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13687 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013688 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013689 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013690 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013691 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013692 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013693 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013694 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013695 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013696#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013697 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013698 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013699#endif
13700
Benjamin Peterson14339b62009-01-31 16:36:08 +000013701 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013702 {NULL, NULL}
13703};
13704
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013705static PyObject *
13706unicode_mod(PyObject *v, PyObject *w)
13707{
Brian Curtindfc80e32011-08-10 20:28:54 -050013708 if (!PyUnicode_Check(v))
13709 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013710 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013711}
13712
13713static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013714 0, /*nb_add*/
13715 0, /*nb_subtract*/
13716 0, /*nb_multiply*/
13717 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013718};
13719
Guido van Rossumd57fd912000-03-10 22:53:23 +000013720static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013721 (lenfunc) unicode_length, /* sq_length */
13722 PyUnicode_Concat, /* sq_concat */
13723 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13724 (ssizeargfunc) unicode_getitem, /* sq_item */
13725 0, /* sq_slice */
13726 0, /* sq_ass_item */
13727 0, /* sq_ass_slice */
13728 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013729};
13730
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013731static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013732unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013733{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013734 if (PyUnicode_READY(self) == -1)
13735 return NULL;
13736
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013737 if (PyIndex_Check(item)) {
13738 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013739 if (i == -1 && PyErr_Occurred())
13740 return NULL;
13741 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013742 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013743 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013744 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013745 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013746 PyObject *result;
13747 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013748 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013749 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013751 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013752 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013753 return NULL;
13754 }
13755
13756 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013757 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013758 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013759 slicelength == PyUnicode_GET_LENGTH(self)) {
13760 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013761 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013762 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013763 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013764 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013765 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013766 src_kind = PyUnicode_KIND(self);
13767 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013768 if (!PyUnicode_IS_ASCII(self)) {
13769 kind_limit = kind_maxchar_limit(src_kind);
13770 max_char = 0;
13771 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13772 ch = PyUnicode_READ(src_kind, src_data, cur);
13773 if (ch > max_char) {
13774 max_char = ch;
13775 if (max_char >= kind_limit)
13776 break;
13777 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013778 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013779 }
Victor Stinner55c99112011-10-13 01:17:06 +020013780 else
13781 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013782 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013783 if (result == NULL)
13784 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013785 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013786 dest_data = PyUnicode_DATA(result);
13787
13788 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013789 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13790 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013791 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013792 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013793 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013794 } else {
13795 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13796 return NULL;
13797 }
13798}
13799
13800static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013801 (lenfunc)unicode_length, /* mp_length */
13802 (binaryfunc)unicode_subscript, /* mp_subscript */
13803 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013804};
13805
Guido van Rossumd57fd912000-03-10 22:53:23 +000013806
Guido van Rossumd57fd912000-03-10 22:53:23 +000013807/* Helpers for PyUnicode_Format() */
13808
Victor Stinnera47082312012-10-04 02:19:54 +020013809struct unicode_formatter_t {
13810 PyObject *args;
13811 int args_owned;
13812 Py_ssize_t arglen, argidx;
13813 PyObject *dict;
13814
13815 enum PyUnicode_Kind fmtkind;
13816 Py_ssize_t fmtcnt, fmtpos;
13817 void *fmtdata;
13818 PyObject *fmtstr;
13819
13820 _PyUnicodeWriter writer;
13821};
13822
13823struct unicode_format_arg_t {
13824 Py_UCS4 ch;
13825 int flags;
13826 Py_ssize_t width;
13827 int prec;
13828 int sign;
13829};
13830
Guido van Rossumd57fd912000-03-10 22:53:23 +000013831static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013832unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013833{
Victor Stinnera47082312012-10-04 02:19:54 +020013834 Py_ssize_t argidx = ctx->argidx;
13835
13836 if (argidx < ctx->arglen) {
13837 ctx->argidx++;
13838 if (ctx->arglen < 0)
13839 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013840 else
Victor Stinnera47082312012-10-04 02:19:54 +020013841 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013842 }
13843 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013844 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013845 return NULL;
13846}
13847
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013848/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013849
Victor Stinnera47082312012-10-04 02:19:54 +020013850/* Format a float into the writer if the writer is not NULL, or into *p_output
13851 otherwise.
13852
13853 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013854static int
Victor Stinnera47082312012-10-04 02:19:54 +020013855formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13856 PyObject **p_output,
13857 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013858{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013859 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013860 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013861 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013862 int prec;
13863 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013864
Guido van Rossumd57fd912000-03-10 22:53:23 +000013865 x = PyFloat_AsDouble(v);
13866 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013867 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013868
Victor Stinnera47082312012-10-04 02:19:54 +020013869 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013870 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013871 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013872
Victor Stinnera47082312012-10-04 02:19:54 +020013873 if (arg->flags & F_ALT)
13874 dtoa_flags = Py_DTSF_ALT;
13875 else
13876 dtoa_flags = 0;
13877 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013878 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013879 return -1;
13880 len = strlen(p);
13881 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013882 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013883 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013884 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013885 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013886 }
13887 else
13888 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013889 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013890 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013891}
13892
Victor Stinnerd0880d52012-04-27 23:40:13 +020013893/* formatlong() emulates the format codes d, u, o, x and X, and
13894 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13895 * Python's regular ints.
13896 * Return value: a new PyUnicodeObject*, or NULL if error.
13897 * The output string is of the form
13898 * "-"? ("0x" | "0X")? digit+
13899 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13900 * set in flags. The case of hex digits will be correct,
13901 * There will be at least prec digits, zero-filled on the left if
13902 * necessary to get that many.
13903 * val object to be converted
13904 * flags bitmask of format flags; only F_ALT is looked at
13905 * prec minimum number of digits; 0-fill on left if needed
13906 * type a character in [duoxX]; u acts the same as d
13907 *
13908 * CAUTION: o, x and X conversions on regular ints can never
13909 * produce a '-' sign, but can for Python's unbounded ints.
13910 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013911static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013912formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013913{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013914 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013915 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013916 Py_ssize_t i;
13917 int sign; /* 1 if '-', else 0 */
13918 int len; /* number of characters */
13919 Py_ssize_t llen;
13920 int numdigits; /* len == numnondigits + numdigits */
13921 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013922 int prec = arg->prec;
13923 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013924
Victor Stinnerd0880d52012-04-27 23:40:13 +020013925 /* Avoid exceeding SSIZE_T_MAX */
13926 if (prec > INT_MAX-3) {
13927 PyErr_SetString(PyExc_OverflowError,
13928 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013929 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013930 }
13931
13932 assert(PyLong_Check(val));
13933
13934 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013935 default:
13936 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013937 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013938 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013939 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013940 /* int and int subclasses should print numerically when a numeric */
13941 /* format code is used (see issue18780) */
13942 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013943 break;
13944 case 'o':
13945 numnondigits = 2;
13946 result = PyNumber_ToBase(val, 8);
13947 break;
13948 case 'x':
13949 case 'X':
13950 numnondigits = 2;
13951 result = PyNumber_ToBase(val, 16);
13952 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013953 }
13954 if (!result)
13955 return NULL;
13956
13957 assert(unicode_modifiable(result));
13958 assert(PyUnicode_IS_READY(result));
13959 assert(PyUnicode_IS_ASCII(result));
13960
13961 /* To modify the string in-place, there can only be one reference. */
13962 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013963 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013964 PyErr_BadInternalCall();
13965 return NULL;
13966 }
13967 buf = PyUnicode_DATA(result);
13968 llen = PyUnicode_GET_LENGTH(result);
13969 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013970 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013971 PyErr_SetString(PyExc_ValueError,
13972 "string too large in _PyBytes_FormatLong");
13973 return NULL;
13974 }
13975 len = (int)llen;
13976 sign = buf[0] == '-';
13977 numnondigits += sign;
13978 numdigits = len - numnondigits;
13979 assert(numdigits > 0);
13980
13981 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013982 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013983 (type == 'o' || type == 'x' || type == 'X'))) {
13984 assert(buf[sign] == '0');
13985 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13986 buf[sign+1] == 'o');
13987 numnondigits -= 2;
13988 buf += 2;
13989 len -= 2;
13990 if (sign)
13991 buf[0] = '-';
13992 assert(len == numnondigits + numdigits);
13993 assert(numdigits > 0);
13994 }
13995
13996 /* Fill with leading zeroes to meet minimum width. */
13997 if (prec > numdigits) {
13998 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13999 numnondigits + prec);
14000 char *b1;
14001 if (!r1) {
14002 Py_DECREF(result);
14003 return NULL;
14004 }
14005 b1 = PyBytes_AS_STRING(r1);
14006 for (i = 0; i < numnondigits; ++i)
14007 *b1++ = *buf++;
14008 for (i = 0; i < prec - numdigits; i++)
14009 *b1++ = '0';
14010 for (i = 0; i < numdigits; i++)
14011 *b1++ = *buf++;
14012 *b1 = '\0';
14013 Py_DECREF(result);
14014 result = r1;
14015 buf = PyBytes_AS_STRING(result);
14016 len = numnondigits + prec;
14017 }
14018
14019 /* Fix up case for hex conversions. */
14020 if (type == 'X') {
14021 /* Need to convert all lower case letters to upper case.
14022 and need to convert 0x to 0X (and -0x to -0X). */
14023 for (i = 0; i < len; i++)
14024 if (buf[i] >= 'a' && buf[i] <= 'x')
14025 buf[i] -= 'a'-'A';
14026 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014027 if (!PyUnicode_Check(result)
14028 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014029 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014030 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014031 Py_DECREF(result);
14032 result = unicode;
14033 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014034 else if (len != PyUnicode_GET_LENGTH(result)) {
14035 if (PyUnicode_Resize(&result, len) < 0)
14036 Py_CLEAR(result);
14037 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014038 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014039}
14040
Ethan Furmandf3ed242014-01-05 06:50:30 -080014041/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014042 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014043 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014044 * -1 and raise an exception on error */
14045static int
Victor Stinnera47082312012-10-04 02:19:54 +020014046mainformatlong(PyObject *v,
14047 struct unicode_format_arg_t *arg,
14048 PyObject **p_output,
14049 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014050{
14051 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014052 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014053
14054 if (!PyNumber_Check(v))
14055 goto wrongtype;
14056
Ethan Furmanf9bba9c2014-01-11 23:20:58 -080014057 /* make sure number is a type of integer */
Ethan Furmana70805e2014-01-12 08:42:35 -080014058 /* if not, issue deprecation warning for now */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014059 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014060 if (type == 'o' || type == 'x' || type == 'X') {
14061 iobj = PyNumber_Index(v);
14062 if (iobj == NULL) {
Ethan Furmanf9bba9c2014-01-11 23:20:58 -080014063 PyErr_Clear();
14064 if (PyErr_WarnEx(PyExc_DeprecationWarning,
14065 "automatic int conversions have been deprecated",
14066 1)) {
14067 return -1;
14068 }
14069 iobj = PyNumber_Long(v);
14070 if (iobj == NULL ) {
14071 if (PyErr_ExceptionMatches(PyExc_TypeError))
14072 goto wrongtype;
14073 return -1;
14074 }
Ethan Furmandf3ed242014-01-05 06:50:30 -080014075 }
14076 }
14077 else {
14078 iobj = PyNumber_Long(v);
14079 if (iobj == NULL ) {
14080 if (PyErr_ExceptionMatches(PyExc_TypeError))
14081 goto wrongtype;
14082 return -1;
14083 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014084 }
14085 assert(PyLong_Check(iobj));
14086 }
14087 else {
14088 iobj = v;
14089 Py_INCREF(iobj);
14090 }
14091
14092 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014093 && arg->width == -1 && arg->prec == -1
14094 && !(arg->flags & (F_SIGN | F_BLANK))
14095 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014096 {
14097 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014098 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014099 int base;
14100
Victor Stinnera47082312012-10-04 02:19:54 +020014101 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014102 {
14103 default:
14104 assert(0 && "'type' not in [diuoxX]");
14105 case 'd':
14106 case 'i':
14107 case 'u':
14108 base = 10;
14109 break;
14110 case 'o':
14111 base = 8;
14112 break;
14113 case 'x':
14114 case 'X':
14115 base = 16;
14116 break;
14117 }
14118
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014119 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14120 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014121 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014122 }
14123 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014124 return 1;
14125 }
14126
Victor Stinnera47082312012-10-04 02:19:54 +020014127 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014128 Py_DECREF(iobj);
14129 if (res == NULL)
14130 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014131 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014132 return 0;
14133
14134wrongtype:
14135 PyErr_Format(PyExc_TypeError,
14136 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020014137 "not %.200s",
14138 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014139 return -1;
14140}
14141
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014142static Py_UCS4
14143formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014144{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014145 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014146 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014147 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014148 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014149 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014150 goto onError;
14151 }
14152 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014153 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014154 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014155 /* make sure number is a type of integer */
Ethan Furmana70805e2014-01-12 08:42:35 -080014156 /* if not, issue deprecation warning for now */
Ethan Furmandf3ed242014-01-05 06:50:30 -080014157 if (!PyLong_Check(v)) {
14158 iobj = PyNumber_Index(v);
14159 if (iobj == NULL) {
Ethan Furmanf9bba9c2014-01-11 23:20:58 -080014160 PyErr_Clear();
14161 if (PyErr_WarnEx(PyExc_DeprecationWarning,
14162 "automatic int conversions have been deprecated",
14163 1)) {
14164 return -1;
14165 }
14166 iobj = PyNumber_Long(v);
14167 if (iobj == NULL ) {
14168 if (PyErr_ExceptionMatches(PyExc_TypeError))
14169 goto onError;
14170 return -1;
14171 }
Ethan Furmandf3ed242014-01-05 06:50:30 -080014172 }
14173 v = iobj;
14174 Py_DECREF(iobj);
14175 }
14176 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014177 x = PyLong_AsLong(v);
14178 if (x == -1 && PyErr_Occurred())
14179 goto onError;
14180
Victor Stinner8faf8212011-12-08 22:14:11 +010014181 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014182 PyErr_SetString(PyExc_OverflowError,
14183 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014184 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014185 }
14186
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014187 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014188 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014189
Benjamin Peterson29060642009-01-31 22:14:21 +000014190 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014191 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014192 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014193 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014194}
14195
Victor Stinnera47082312012-10-04 02:19:54 +020014196/* Parse options of an argument: flags, width, precision.
14197 Handle also "%(name)" syntax.
14198
14199 Return 0 if the argument has been formatted into arg->str.
14200 Return 1 if the argument has been written into ctx->writer,
14201 Raise an exception and return -1 on error. */
14202static int
14203unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14204 struct unicode_format_arg_t *arg)
14205{
14206#define FORMAT_READ(ctx) \
14207 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14208
14209 PyObject *v;
14210
Victor Stinnera47082312012-10-04 02:19:54 +020014211 if (arg->ch == '(') {
14212 /* Get argument value from a dictionary. Example: "%(name)s". */
14213 Py_ssize_t keystart;
14214 Py_ssize_t keylen;
14215 PyObject *key;
14216 int pcount = 1;
14217
14218 if (ctx->dict == NULL) {
14219 PyErr_SetString(PyExc_TypeError,
14220 "format requires a mapping");
14221 return -1;
14222 }
14223 ++ctx->fmtpos;
14224 --ctx->fmtcnt;
14225 keystart = ctx->fmtpos;
14226 /* Skip over balanced parentheses */
14227 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14228 arg->ch = FORMAT_READ(ctx);
14229 if (arg->ch == ')')
14230 --pcount;
14231 else if (arg->ch == '(')
14232 ++pcount;
14233 ctx->fmtpos++;
14234 }
14235 keylen = ctx->fmtpos - keystart - 1;
14236 if (ctx->fmtcnt < 0 || pcount > 0) {
14237 PyErr_SetString(PyExc_ValueError,
14238 "incomplete format key");
14239 return -1;
14240 }
14241 key = PyUnicode_Substring(ctx->fmtstr,
14242 keystart, keystart + keylen);
14243 if (key == NULL)
14244 return -1;
14245 if (ctx->args_owned) {
14246 Py_DECREF(ctx->args);
14247 ctx->args_owned = 0;
14248 }
14249 ctx->args = PyObject_GetItem(ctx->dict, key);
14250 Py_DECREF(key);
14251 if (ctx->args == NULL)
14252 return -1;
14253 ctx->args_owned = 1;
14254 ctx->arglen = -1;
14255 ctx->argidx = -2;
14256 }
14257
14258 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014259 while (--ctx->fmtcnt >= 0) {
14260 arg->ch = FORMAT_READ(ctx);
14261 ctx->fmtpos++;
14262 switch (arg->ch) {
14263 case '-': arg->flags |= F_LJUST; continue;
14264 case '+': arg->flags |= F_SIGN; continue;
14265 case ' ': arg->flags |= F_BLANK; continue;
14266 case '#': arg->flags |= F_ALT; continue;
14267 case '0': arg->flags |= F_ZERO; continue;
14268 }
14269 break;
14270 }
14271
14272 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014273 if (arg->ch == '*') {
14274 v = unicode_format_getnextarg(ctx);
14275 if (v == NULL)
14276 return -1;
14277 if (!PyLong_Check(v)) {
14278 PyErr_SetString(PyExc_TypeError,
14279 "* wants int");
14280 return -1;
14281 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014282 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014283 if (arg->width == -1 && PyErr_Occurred())
14284 return -1;
14285 if (arg->width < 0) {
14286 arg->flags |= F_LJUST;
14287 arg->width = -arg->width;
14288 }
14289 if (--ctx->fmtcnt >= 0) {
14290 arg->ch = FORMAT_READ(ctx);
14291 ctx->fmtpos++;
14292 }
14293 }
14294 else if (arg->ch >= '0' && arg->ch <= '9') {
14295 arg->width = arg->ch - '0';
14296 while (--ctx->fmtcnt >= 0) {
14297 arg->ch = FORMAT_READ(ctx);
14298 ctx->fmtpos++;
14299 if (arg->ch < '0' || arg->ch > '9')
14300 break;
14301 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14302 mixing signed and unsigned comparison. Since arg->ch is between
14303 '0' and '9', casting to int is safe. */
14304 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14305 PyErr_SetString(PyExc_ValueError,
14306 "width too big");
14307 return -1;
14308 }
14309 arg->width = arg->width*10 + (arg->ch - '0');
14310 }
14311 }
14312
14313 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014314 if (arg->ch == '.') {
14315 arg->prec = 0;
14316 if (--ctx->fmtcnt >= 0) {
14317 arg->ch = FORMAT_READ(ctx);
14318 ctx->fmtpos++;
14319 }
14320 if (arg->ch == '*') {
14321 v = unicode_format_getnextarg(ctx);
14322 if (v == NULL)
14323 return -1;
14324 if (!PyLong_Check(v)) {
14325 PyErr_SetString(PyExc_TypeError,
14326 "* wants int");
14327 return -1;
14328 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014329 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014330 if (arg->prec == -1 && PyErr_Occurred())
14331 return -1;
14332 if (arg->prec < 0)
14333 arg->prec = 0;
14334 if (--ctx->fmtcnt >= 0) {
14335 arg->ch = FORMAT_READ(ctx);
14336 ctx->fmtpos++;
14337 }
14338 }
14339 else if (arg->ch >= '0' && arg->ch <= '9') {
14340 arg->prec = arg->ch - '0';
14341 while (--ctx->fmtcnt >= 0) {
14342 arg->ch = FORMAT_READ(ctx);
14343 ctx->fmtpos++;
14344 if (arg->ch < '0' || arg->ch > '9')
14345 break;
14346 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14347 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014348 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014349 return -1;
14350 }
14351 arg->prec = arg->prec*10 + (arg->ch - '0');
14352 }
14353 }
14354 }
14355
14356 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14357 if (ctx->fmtcnt >= 0) {
14358 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14359 if (--ctx->fmtcnt >= 0) {
14360 arg->ch = FORMAT_READ(ctx);
14361 ctx->fmtpos++;
14362 }
14363 }
14364 }
14365 if (ctx->fmtcnt < 0) {
14366 PyErr_SetString(PyExc_ValueError,
14367 "incomplete format");
14368 return -1;
14369 }
14370 return 0;
14371
14372#undef FORMAT_READ
14373}
14374
14375/* Format one argument. Supported conversion specifiers:
14376
14377 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014378 - "i", "d", "u": int or float
14379 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014380 - "e", "E", "f", "F", "g", "G": float
14381 - "c": int or str (1 character)
14382
Victor Stinner8dbd4212012-12-04 09:30:24 +010014383 When possible, the output is written directly into the Unicode writer
14384 (ctx->writer). A string is created when padding is required.
14385
Victor Stinnera47082312012-10-04 02:19:54 +020014386 Return 0 if the argument has been formatted into *p_str,
14387 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014388 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014389static int
14390unicode_format_arg_format(struct unicode_formatter_t *ctx,
14391 struct unicode_format_arg_t *arg,
14392 PyObject **p_str)
14393{
14394 PyObject *v;
14395 _PyUnicodeWriter *writer = &ctx->writer;
14396
14397 if (ctx->fmtcnt == 0)
14398 ctx->writer.overallocate = 0;
14399
14400 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014401 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014402 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014403 return 1;
14404 }
14405
14406 v = unicode_format_getnextarg(ctx);
14407 if (v == NULL)
14408 return -1;
14409
Victor Stinnera47082312012-10-04 02:19:54 +020014410
14411 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014412 case 's':
14413 case 'r':
14414 case 'a':
14415 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14416 /* Fast path */
14417 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14418 return -1;
14419 return 1;
14420 }
14421
14422 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14423 *p_str = v;
14424 Py_INCREF(*p_str);
14425 }
14426 else {
14427 if (arg->ch == 's')
14428 *p_str = PyObject_Str(v);
14429 else if (arg->ch == 'r')
14430 *p_str = PyObject_Repr(v);
14431 else
14432 *p_str = PyObject_ASCII(v);
14433 }
14434 break;
14435
14436 case 'i':
14437 case 'd':
14438 case 'u':
14439 case 'o':
14440 case 'x':
14441 case 'X':
14442 {
14443 int ret = mainformatlong(v, arg, p_str, writer);
14444 if (ret != 0)
14445 return ret;
14446 arg->sign = 1;
14447 break;
14448 }
14449
14450 case 'e':
14451 case 'E':
14452 case 'f':
14453 case 'F':
14454 case 'g':
14455 case 'G':
14456 if (arg->width == -1 && arg->prec == -1
14457 && !(arg->flags & (F_SIGN | F_BLANK)))
14458 {
14459 /* Fast path */
14460 if (formatfloat(v, arg, NULL, writer) == -1)
14461 return -1;
14462 return 1;
14463 }
14464
14465 arg->sign = 1;
14466 if (formatfloat(v, arg, p_str, NULL) == -1)
14467 return -1;
14468 break;
14469
14470 case 'c':
14471 {
14472 Py_UCS4 ch = formatchar(v);
14473 if (ch == (Py_UCS4) -1)
14474 return -1;
14475 if (arg->width == -1 && arg->prec == -1) {
14476 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014477 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014478 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014479 return 1;
14480 }
14481 *p_str = PyUnicode_FromOrdinal(ch);
14482 break;
14483 }
14484
14485 default:
14486 PyErr_Format(PyExc_ValueError,
14487 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014488 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014489 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14490 (int)arg->ch,
14491 ctx->fmtpos - 1);
14492 return -1;
14493 }
14494 if (*p_str == NULL)
14495 return -1;
14496 assert (PyUnicode_Check(*p_str));
14497 return 0;
14498}
14499
/* Write the already formatted string str into ctx->writer, applying the
   width, precision, sign and alternate-form ("#") handling described by
   arg.

   The fill character is '0' only when arg->sign is set and the F_ZERO flag
   is given; otherwise it is a space. Left padding uses the fill character,
   right padding always uses spaces.

   arg->width and arg->sign are modified in place while the layout is
   computed.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path: no padding, no truncation and no sign flag, so the
           string is written unchanged */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width: a leading '-' or '+' in str is split off
       (pindex skips it, len no longer counts it); otherwise the sign
       character comes from the flags, or sign handling is switched off */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer: compute the largest character that will be
       written (fill character if left padding occurs, plus the characters
       of str that are going to be copied) */
    maxchar = writer->maxchar;
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    /* Reserve one extra character for the sign when the text alone
       already fills the whole width */
    buflen = arg->width;
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed (before the padding when the fill
       character is not a space) */
    if (arg->sign) {
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are accounted for separately from
           the remaining text */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right if needed; right padding always uses spaces, whatever
       the fill character is */
    if (arg->width > len) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14654
14655/* Helper of PyUnicode_Format(): format one arg.
14656 Return 0 on success, raise an exception and return -1 on error. */
14657static int
14658unicode_format_arg(struct unicode_formatter_t *ctx)
14659{
14660 struct unicode_format_arg_t arg;
14661 PyObject *str;
14662 int ret;
14663
Victor Stinner8dbd4212012-12-04 09:30:24 +010014664 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14665 arg.flags = 0;
14666 arg.width = -1;
14667 arg.prec = -1;
14668 arg.sign = 0;
14669 str = NULL;
14670
Victor Stinnera47082312012-10-04 02:19:54 +020014671 ret = unicode_format_arg_parse(ctx, &arg);
14672 if (ret == -1)
14673 return -1;
14674
14675 ret = unicode_format_arg_format(ctx, &arg, &str);
14676 if (ret == -1)
14677 return -1;
14678
14679 if (ret != 1) {
14680 ret = unicode_format_arg_output(ctx, &arg, str);
14681 Py_DECREF(str);
14682 if (ret == -1)
14683 return -1;
14684 }
14685
14686 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14687 PyErr_SetString(PyExc_TypeError,
14688 "not all arguments converted during string formatting");
14689 return -1;
14690 }
14691 return 0;
14692}
14693
Alexander Belopolsky40018472011-02-26 01:02:56 +000014694PyObject *
14695PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014696{
Victor Stinnera47082312012-10-04 02:19:54 +020014697 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014698
Guido van Rossumd57fd912000-03-10 22:53:23 +000014699 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014700 PyErr_BadInternalCall();
14701 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014702 }
Victor Stinnera47082312012-10-04 02:19:54 +020014703
14704 ctx.fmtstr = PyUnicode_FromObject(format);
14705 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014706 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014707 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14708 Py_DECREF(ctx.fmtstr);
14709 return NULL;
14710 }
14711 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14712 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14713 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14714 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014715
Victor Stinner8f674cc2013-04-17 23:02:17 +020014716 _PyUnicodeWriter_Init(&ctx.writer);
14717 ctx.writer.min_length = ctx.fmtcnt + 100;
14718 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014719
Guido van Rossumd57fd912000-03-10 22:53:23 +000014720 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014721 ctx.arglen = PyTuple_Size(args);
14722 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014723 }
14724 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014725 ctx.arglen = -1;
14726 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014727 }
Victor Stinnera47082312012-10-04 02:19:54 +020014728 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014729 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014730 ctx.dict = args;
14731 else
14732 ctx.dict = NULL;
14733 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014734
Victor Stinnera47082312012-10-04 02:19:54 +020014735 while (--ctx.fmtcnt >= 0) {
14736 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014737 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014738
14739 nonfmtpos = ctx.fmtpos++;
14740 while (ctx.fmtcnt >= 0 &&
14741 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14742 ctx.fmtpos++;
14743 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014744 }
Victor Stinnera47082312012-10-04 02:19:54 +020014745 if (ctx.fmtcnt < 0) {
14746 ctx.fmtpos--;
14747 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014748 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014749
Victor Stinnercfc4c132013-04-03 01:48:39 +020014750 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14751 nonfmtpos, ctx.fmtpos) < 0)
14752 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014753 }
14754 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014755 ctx.fmtpos++;
14756 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014757 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014758 }
14759 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014760
Victor Stinnera47082312012-10-04 02:19:54 +020014761 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014762 PyErr_SetString(PyExc_TypeError,
14763 "not all arguments converted during string formatting");
14764 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014765 }
14766
Victor Stinnera47082312012-10-04 02:19:54 +020014767 if (ctx.args_owned) {
14768 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014769 }
Victor Stinnera47082312012-10-04 02:19:54 +020014770 Py_DECREF(ctx.fmtstr);
14771 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014772
Benjamin Peterson29060642009-01-31 22:14:21 +000014773 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014774 Py_DECREF(ctx.fmtstr);
14775 _PyUnicodeWriter_Dealloc(&ctx.writer);
14776 if (ctx.args_owned) {
14777 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014778 }
14779 return NULL;
14780}
14781
Jeremy Hylton938ace62002-07-17 16:30:39 +000014782static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014783unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14784
Tim Peters6d6c1a32001-08-02 04:15:00 +000014785static PyObject *
14786unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14787{
Benjamin Peterson29060642009-01-31 22:14:21 +000014788 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014789 static char *kwlist[] = {"object", "encoding", "errors", 0};
14790 char *encoding = NULL;
14791 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014792
Benjamin Peterson14339b62009-01-31 16:36:08 +000014793 if (type != &PyUnicode_Type)
14794 return unicode_subtype_new(type, args, kwds);
14795 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014796 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014797 return NULL;
14798 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014799 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014800 if (encoding == NULL && errors == NULL)
14801 return PyObject_Str(x);
14802 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014803 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014804}
14805
Guido van Rossume023fe02001-08-30 03:12:59 +000014806static PyObject *
14807unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14808{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014809 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014810 Py_ssize_t length, char_size;
14811 int share_wstr, share_utf8;
14812 unsigned int kind;
14813 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014814
Benjamin Peterson14339b62009-01-31 16:36:08 +000014815 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014816
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014817 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014818 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014819 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014820 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014821 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014822 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014823 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014824 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014825
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014826 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014827 if (self == NULL) {
14828 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014829 return NULL;
14830 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014831 kind = PyUnicode_KIND(unicode);
14832 length = PyUnicode_GET_LENGTH(unicode);
14833
14834 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014835#ifdef Py_DEBUG
14836 _PyUnicode_HASH(self) = -1;
14837#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014838 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014839#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014840 _PyUnicode_STATE(self).interned = 0;
14841 _PyUnicode_STATE(self).kind = kind;
14842 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014843 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014844 _PyUnicode_STATE(self).ready = 1;
14845 _PyUnicode_WSTR(self) = NULL;
14846 _PyUnicode_UTF8_LENGTH(self) = 0;
14847 _PyUnicode_UTF8(self) = NULL;
14848 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014849 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014850
14851 share_utf8 = 0;
14852 share_wstr = 0;
14853 if (kind == PyUnicode_1BYTE_KIND) {
14854 char_size = 1;
14855 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14856 share_utf8 = 1;
14857 }
14858 else if (kind == PyUnicode_2BYTE_KIND) {
14859 char_size = 2;
14860 if (sizeof(wchar_t) == 2)
14861 share_wstr = 1;
14862 }
14863 else {
14864 assert(kind == PyUnicode_4BYTE_KIND);
14865 char_size = 4;
14866 if (sizeof(wchar_t) == 4)
14867 share_wstr = 1;
14868 }
14869
14870 /* Ensure we won't overflow the length. */
14871 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14872 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014873 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014874 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014875 data = PyObject_MALLOC((length + 1) * char_size);
14876 if (data == NULL) {
14877 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014878 goto onError;
14879 }
14880
Victor Stinnerc3c74152011-10-02 20:39:55 +020014881 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014882 if (share_utf8) {
14883 _PyUnicode_UTF8_LENGTH(self) = length;
14884 _PyUnicode_UTF8(self) = data;
14885 }
14886 if (share_wstr) {
14887 _PyUnicode_WSTR_LENGTH(self) = length;
14888 _PyUnicode_WSTR(self) = (wchar_t *)data;
14889 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014890
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014891 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014892 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014893 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014894#ifdef Py_DEBUG
14895 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14896#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014897 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014898 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014899
14900onError:
14901 Py_DECREF(unicode);
14902 Py_DECREF(self);
14903 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014904}
14905
/* Docstring of the str type; it is installed as tp_doc in
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014917
/* Forward declaration: unicode_iter() is referenced by the tp_iter slot
   of the type object below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of str. Py_TPFLAGS_BASETYPE makes the type subclassable;
   instances are created through unicode_new(). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash */
    0,                            /* tp_call */
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
14963
/* Initialize the Unicode implementation.

   Checks that the empty string singleton can be created, readies the str
   type and the helper types defined in this file, and builds the bloom
   filter used for line break characters.

   A failure to create the empty string or to ready a type is fatal
   (Py_FatalError). Return 0 on success; when HAVE_MBCS is defined, return
   -1 with an exception set if GetVersionEx() fails. */

int _PyUnicode_Init(void)
{
    /* XXX - move this array to unicodectype.c ? */
    Py_UCS2 linebreak[] = {
        0x000A, /* LINE FEED */
        0x000D, /* CARRIAGE RETURN */
        0x001C, /* FILE SEPARATOR */
        0x001D, /* GROUP SEPARATOR */
        0x001E, /* RECORD SEPARATOR */
        0x0085, /* NEXT LINE */
        0x2028, /* LINE SEPARATOR */
        0x2029, /* PARAGRAPH SEPARATOR */
    };

    /* Init the implementation */
    /* NOTE(review): _Py_INCREF_UNICODE_EMPTY() presumably creates the
       singleton on first use; the Py_DECREF() below gives back the
       reference taken here. */
    _Py_INCREF_UNICODE_EMPTY();
    if (!unicode_empty)
        Py_FatalError("Can't create empty string");
    Py_DECREF(unicode_empty);

    if (PyType_Ready(&PyUnicode_Type) < 0)
        Py_FatalError("Can't initialize 'unicode'");

    /* initialize the linebreak bloom filter */
    bloom_linebreak = make_bloom_mask(
        PyUnicode_2BYTE_KIND, linebreak,
        Py_ARRAY_LENGTH(linebreak));

    if (PyType_Ready(&EncodingMapType) < 0)
         Py_FatalError("Can't initialize encoding map type");

    if (PyType_Ready(&PyFieldNameIter_Type) < 0)
        Py_FatalError("Can't initialize field name iterator type");

    if (PyType_Ready(&PyFormatterIter_Type) < 0)
        Py_FatalError("Can't initialize formatter iter type");

#ifdef HAVE_MBCS
    /* Record the Windows version information in winver */
    winver.dwOSVersionInfoSize = sizeof(winver);
    if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
        PyErr_SetFromWindowsErr(0);
        return -1;
    }
#endif
    return 0;
}
15012
/* Clear the free list of Unicode objects.

   This implementation does nothing and always returns 0. */

int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15020
Guido van Rossumd57fd912000-03-10 22:53:23 +000015021void
Thomas Wouters78890102000-07-22 19:25:51 +000015022_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015023{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015024 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015025
Serhiy Storchaka05997252013-01-26 12:14:02 +020015026 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015027
Serhiy Storchaka05997252013-01-26 12:14:02 +020015028 for (i = 0; i < 256; i++)
15029 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015030 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015031 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015032}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015033
/* Intern the string *p in place.

   If an equal string is already in the interned dict, *p is replaced by
   that string: a new reference to it is stored in *p and the reference to
   the original object is released. Otherwise *p itself is added to the
   dict and marked SSTATE_INTERNED_MORTAL.

   Nothing happens if *p is not an exact str (in non-debug builds also if
   it is NULL or not a str at all), if it is already interned, or if the
   dict cannot be created or updated; errors are cleared, never reported. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    PyObject *s = *p;
    PyObject *t;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    if (s == NULL || !PyUnicode_Check(s))
        return;
#endif
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The interned dict is created on first use */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* An equal string is already interned: hand it out instead */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* recursion_critical is set around the insertion and reset on both
       the failure and the success path */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, s, s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
15085
15086void
15087PyUnicode_InternImmortal(PyObject **p)
15088{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015089 PyUnicode_InternInPlace(p);
15090 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015091 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015092 Py_INCREF(*p);
15093 }
Walter Dörwald16807132007-05-25 13:52:07 +000015094}
15095
15096PyObject *
15097PyUnicode_InternFromString(const char *cp)
15098{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015099 PyObject *s = PyUnicode_FromString(cp);
15100 if (s == NULL)
15101 return NULL;
15102 PyUnicode_InternInPlace(&s);
15103 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015104}
15105
/* Release the interned dict.

   Every interned string gets back the references that interning left
   uncounted and is marked as not interned; the dict is then cleared and
   released. Progress and size statistics are printed to stderr.

   Nothing is done if there is no interned dict or if its keys cannot be
   listed (the error is cleared). */
void
_Py_ReleaseInternedUnicodeStrings(void)
{
    PyObject *keys;
    PyObject *s;
    Py_ssize_t i, n;
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned))
        return;
    keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    n = PyList_GET_SIZE(keys);
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);
    for (i = 0; i < n; i++) {
        s = PyList_GET_ITEM(keys, i);
        /* A string that cannot be made ready is only reported; the loop
           still goes on to process it */
        if (PyUnicode_READY(s) == -1) {
            assert(0 && "could not ready string");
            fprintf(stderr, "could not ready string\n");
        }
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        case SSTATE_INTERNED_IMMORTAL:
            /* One reference is restored for an immortal string */
            Py_REFCNT(s) += 1;
            immortal_size += PyUnicode_GET_LENGTH(s);
            break;
        case SSTATE_INTERNED_MORTAL:
            /* Two references are restored for a mortal string */
            Py_REFCNT(s) += 2;
            mortal_size += PyUnicode_GET_LENGTH(s);
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015160
15161
15162/********************* Unicode Iterator **************************/
15163
/* Iterator over the characters of a str object. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to return */
    PyObject *it_seq;       /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15169
15170static void
15171unicodeiter_dealloc(unicodeiterobject *it)
15172{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015173 _PyObject_GC_UNTRACK(it);
15174 Py_XDECREF(it->it_seq);
15175 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015176}
15177
15178static int
15179unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15180{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015181 Py_VISIT(it->it_seq);
15182 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015183}
15184
15185static PyObject *
15186unicodeiter_next(unicodeiterobject *it)
15187{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015188 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015189
Benjamin Peterson14339b62009-01-31 16:36:08 +000015190 assert(it != NULL);
15191 seq = it->it_seq;
15192 if (seq == NULL)
15193 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015194 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015196 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15197 int kind = PyUnicode_KIND(seq);
15198 void *data = PyUnicode_DATA(seq);
15199 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15200 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015201 if (item != NULL)
15202 ++it->it_index;
15203 return item;
15204 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015205
Benjamin Peterson14339b62009-01-31 16:36:08 +000015206 Py_DECREF(seq);
15207 it->it_seq = NULL;
15208 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015209}
15210
15211static PyObject *
15212unicodeiter_len(unicodeiterobject *it)
15213{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015214 Py_ssize_t len = 0;
15215 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015216 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015217 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015218}
15219
15220PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15221
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015222static PyObject *
15223unicodeiter_reduce(unicodeiterobject *it)
15224{
15225 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015226 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015227 it->it_seq, it->it_index);
15228 } else {
15229 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15230 if (u == NULL)
15231 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015232 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015233 }
15234}
15235
15236PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15237
15238static PyObject *
15239unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15240{
15241 Py_ssize_t index = PyLong_AsSsize_t(state);
15242 if (index == -1 && PyErr_Occurred())
15243 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015244 if (it->it_seq != NULL) {
15245 if (index < 0)
15246 index = 0;
15247 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15248 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15249 it->it_index = index;
15250 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015251 Py_RETURN_NONE;
15252}
15253
15254PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15255
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015256static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015257 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015258 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015259 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15260 reduce_doc},
15261 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15262 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015263 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015264};
15265
15266PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015267 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15268 "str_iterator", /* tp_name */
15269 sizeof(unicodeiterobject), /* tp_basicsize */
15270 0, /* tp_itemsize */
15271 /* methods */
15272 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15273 0, /* tp_print */
15274 0, /* tp_getattr */
15275 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015276 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015277 0, /* tp_repr */
15278 0, /* tp_as_number */
15279 0, /* tp_as_sequence */
15280 0, /* tp_as_mapping */
15281 0, /* tp_hash */
15282 0, /* tp_call */
15283 0, /* tp_str */
15284 PyObject_GenericGetAttr, /* tp_getattro */
15285 0, /* tp_setattro */
15286 0, /* tp_as_buffer */
15287 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15288 0, /* tp_doc */
15289 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15290 0, /* tp_clear */
15291 0, /* tp_richcompare */
15292 0, /* tp_weaklistoffset */
15293 PyObject_SelfIter, /* tp_iter */
15294 (iternextfunc)unicodeiter_next, /* tp_iternext */
15295 unicodeiter_methods, /* tp_methods */
15296 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015297};
15298
15299static PyObject *
15300unicode_iter(PyObject *seq)
15301{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015302 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015303
Benjamin Peterson14339b62009-01-31 16:36:08 +000015304 if (!PyUnicode_Check(seq)) {
15305 PyErr_BadInternalCall();
15306 return NULL;
15307 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015308 if (PyUnicode_READY(seq) == -1)
15309 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015310 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15311 if (it == NULL)
15312 return NULL;
15313 it->it_index = 0;
15314 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015315 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015316 _PyObject_GC_TRACK(it);
15317 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015318}
15319
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015320
15321size_t
15322Py_UNICODE_strlen(const Py_UNICODE *u)
15323{
15324 int res = 0;
15325 while(*u++)
15326 res++;
15327 return res;
15328}
15329
15330Py_UNICODE*
15331Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15332{
15333 Py_UNICODE *u = s1;
15334 while ((*u++ = *s2++));
15335 return s1;
15336}
15337
15338Py_UNICODE*
15339Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15340{
15341 Py_UNICODE *u = s1;
15342 while ((*u++ = *s2++))
15343 if (n-- == 0)
15344 break;
15345 return s1;
15346}
15347
15348Py_UNICODE*
15349Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15350{
15351 Py_UNICODE *u1 = s1;
15352 u1 += Py_UNICODE_strlen(u1);
15353 Py_UNICODE_strcpy(u1, s2);
15354 return s1;
15355}
15356
15357int
15358Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15359{
15360 while (*s1 && *s2 && *s1 == *s2)
15361 s1++, s2++;
15362 if (*s1 && *s2)
15363 return (*s1 < *s2) ? -1 : +1;
15364 if (*s1)
15365 return 1;
15366 if (*s2)
15367 return -1;
15368 return 0;
15369}
15370
15371int
15372Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15373{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015374 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015375 for (; n != 0; n--) {
15376 u1 = *s1;
15377 u2 = *s2;
15378 if (u1 != u2)
15379 return (u1 < u2) ? -1 : +1;
15380 if (u1 == '\0')
15381 return 0;
15382 s1++;
15383 s2++;
15384 }
15385 return 0;
15386}
15387
15388Py_UNICODE*
15389Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15390{
15391 const Py_UNICODE *p;
15392 for (p = s; *p; p++)
15393 if (*p == c)
15394 return (Py_UNICODE*)p;
15395 return NULL;
15396}
15397
15398Py_UNICODE*
15399Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15400{
15401 const Py_UNICODE *p;
15402 p = s + Py_UNICODE_strlen(s);
15403 while (p != s) {
15404 p--;
15405 if (*p == c)
15406 return (Py_UNICODE*)p;
15407 }
15408 return NULL;
15409}
Victor Stinner331ea922010-08-10 16:37:20 +000015410
Victor Stinner71133ff2010-09-01 23:43:53 +000015411Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015412PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015413{
Victor Stinner577db2c2011-10-11 22:12:48 +020015414 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015415 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015417 if (!PyUnicode_Check(unicode)) {
15418 PyErr_BadArgument();
15419 return NULL;
15420 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015421 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015422 if (u == NULL)
15423 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015424 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015425 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015426 PyErr_NoMemory();
15427 return NULL;
15428 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015429 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015430 size *= sizeof(Py_UNICODE);
15431 copy = PyMem_Malloc(size);
15432 if (copy == NULL) {
15433 PyErr_NoMemory();
15434 return NULL;
15435 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015436 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015437 return copy;
15438}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015439
Georg Brandl66c221e2010-10-14 07:04:07 +000015440/* A _string module, to export formatter_parser and formatter_field_name_split
15441 to the string.Formatter class implemented in Python. */
15442
15443static PyMethodDef _string_methods[] = {
15444 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15445 METH_O, PyDoc_STR("split the argument as a field name")},
15446 {"formatter_parser", (PyCFunction) formatter_parser,
15447 METH_O, PyDoc_STR("parse the argument as a format string")},
15448 {NULL, NULL}
15449};
15450
15451static struct PyModuleDef _string_module = {
15452 PyModuleDef_HEAD_INIT,
15453 "_string",
15454 PyDoc_STR("string helper module"),
15455 0,
15456 _string_methods,
15457 NULL,
15458 NULL,
15459 NULL,
15460 NULL
15461};
15462
15463PyMODINIT_FUNC
15464PyInit__string(void)
15465{
15466 return PyModule_Create(&_string_module);
15467}
15468
15469
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015470#ifdef __cplusplus
15471}
15472#endif