blob: 1ce10cfd903ab2d282d2ba1b74ada29973bb8172 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Larry Hastings61272b72014-01-07 12:41:53 -080050/*[clinic input]
Larry Hastings44e2eaa2013-11-23 15:37:55 -080051class str
Larry Hastings61272b72014-01-07 12:41:53 -080052[clinic start generated code]*/
53/*[clinic end generated code: checksum=da39a3ee5e6b4b0d3255bfef95601890afd80709]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080054
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055/* --- Globals ------------------------------------------------------------
56
Serhiy Storchaka05997252013-01-26 12:14:02 +020057NOTE: In the interpreter's initialization phase, some globals are currently
58 initialized dynamically as needed. In the process Unicode objects may
59 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000060
61*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063
64#ifdef __cplusplus
65extern "C" {
66#endif
67
Victor Stinner8faf8212011-12-08 22:14:11 +010068/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
69#define MAX_UNICODE 0x10ffff
70
/* Object check used by the assertions in this file.  Release builds only
   verify the type; debug builds run the full consistency check, without
   the content scan. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020076
/* --- Direct member access (usable as lvalues) --- */

/* Members stored in the PyASCIIObject header. */
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_WSTR(op)         (((PyASCIIObject*)(op))->wstr)

/* Members stored in the PyCompactUnicodeObject header. */
#define _PyUnicode_UTF8(op)         (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Member stored in the full PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject*)(op))->data.any)

/* --- Checked read accessors --- */

/* Kind and length, with the object checked by an assertion first. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* UTF-8 pointer of a ready string: for a compact ASCII string this is the
   memory directly following the PyASCIIObject header, otherwise the utf8
   member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op)                     \
         ? ((char*)((PyASCIIObject*)(op) + 1))          \
         : _PyUnicode_UTF8(op))

/* UTF-8 length of a ready string: for a compact ASCII string this is the
   string length itself, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op)                     \
         ? ((PyASCIIObject*)(op))->length               \
         : _PyUnicode_UTF8_LENGTH(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200111
/* File-local replacement for PyUnicode_READY(): asserts on the object, then
   evaluates to 0 when it is already ready and to the result of
   _PyUnicode_Ready() otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_READY(op) ? _PyUnicode_Ready(op) : 0))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True if the utf8 member points at the string's own character data.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (PyUnicode_DATA(op) == _PyUnicode_UTF8(op)))
/* True if the wstr member points at the string's own character data.
   The macro only refers to its parameter `op`; it must not depend on any
   identifier that happens to be in scope at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* True if the Unicode object owns a separate UTF-8 memory block, i.e. it is
   not compact ASCII, the utf8 pointer is set, and that pointer is not the
   string's own character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op) != NULL                     \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object owns a separate wstr memory block, i.e. the
   wstr pointer is set and either the string is not ready or the pointer is
   not the string's own character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) != NULL                        \
     && (!PyUnicode_IS_READY(op)                        \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
140
/* Generic helper macro converting characters from one width to another.
   from_type and to_type must be valid type names.  begin and end delimit
   the source characters (read as "from_type *"); to points at the
   destination buffer (written as "to_type *").  Each source character is
   cast to to_type.  The bulk of the range is processed four characters per
   iteration, the remaining 0-3 characters one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_dst = (to_type *)(to);                                \
        const from_type *_src = (from_type *)(begin);                   \
        const from_type *_stop = (from_type *)(end);                    \
        Py_ssize_t _count = _stop - _src;                               \
        const from_type *_bulk_stop =                                   \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);                      \
        for (; _src < _bulk_stop; _src += 4, _dst += 4) {               \
            _dst[0] = (to_type) _src[0];                                \
            _dst[1] = (to_type) _src[1];                                \
            _dst[2] = (to_type) _src[2];                                \
            _dst[3] = (to_type) _src[3];                                \
        }                                                               \
        for (; _src < _stop; _src++, _dst++)                            \
            *_dst = (to_type) *_src;                                    \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
Walter Dörwald16807132007-05-25 13:52:07 +0000165/* This dictionary holds all interned unicode strings. Note that references
166 to strings in this dictionary are *not* counted in the string's ob_refcnt.
167 When the interned string reaches a refcnt of 0 the string deallocation
168 function will delete the reference from this dictionary.
169
   Another way to look at this is to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000171 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000172*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200173static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000174
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200176static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200177
/* Take a new reference to the shared empty string, creating it on first
   use.  If the creation fails, unicode_empty is left NULL and no reference
   is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else {                                                          \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function; the returned value is whatever unicode_empty holds after
   _Py_INCREF_UNICODE_EMPTY(), which is NULL if its creation failed. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200197/* Forward declaration */
198Py_LOCAL_INLINE(int)
199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
200
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry [c] is 1 when the ASCII code c (0..127) is one of

     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE

   and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100242static int unicode_modifiable(PyObject *unicode);
243
Victor Stinnerfe226c02011-10-03 03:52:20 +0200244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200247static PyObject *
248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
249static PyObject *
250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
251
252static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100255 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
257
Alexander Belopolsky40018472011-02-26 01:02:56 +0000258static void
259raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300260 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100261 PyObject *unicode,
262 Py_ssize_t startpos, Py_ssize_t endpos,
263 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000264
/* Lookup table for fast detection of line breaks in the ASCII range:
   entry [c] is 1 when the ASCII code c (0..127) is one of

     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR

   and 0 otherwise.  The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
292
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300293/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
294 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000296PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000298#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000299 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000300#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 /* This is actually an illegal character, so it should
302 not be passed to unichr. */
303 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000304#endif
305}
306
#ifdef Py_DEBUG
/* Debug-build sanity check of the internal layout of a Unicode object.

   Asserts that the state flags (kind, compact, ascii, ready), the data
   pointer and the cached utf8/wstr members of op agree with each other.
   When check_content is non-zero and the kind is not PyUnicode_WCHAR_KIND,
   the characters are scanned as well to assert that the narrowest possible
   kind is used and that the buffer ends with a null character.

   Always returns 1, so the call can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    unsigned int kind;

    assert(PyUnicode_Check(op));
    kind = ascii->state.kind;

    if (ascii->state.compact == 1 && ascii->state.ascii == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind whose character size equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            data = ((PyUnicodeObject *)op)->data.any;

            if (kind != PyUnicode_WCHAR_KIND) {
                /* non-compact and ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
            else {
                /* non-compact and not ready: only wstr is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_sized_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t i;

        for (i = 0; i < ascii->length; i++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (maxchar < ch)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character buffer must be null-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
422
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100423static PyObject*
424unicode_result_wchar(PyObject *unicode)
425{
426#ifndef Py_DEBUG
427 Py_ssize_t len;
428
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100429 len = _PyUnicode_WSTR_LENGTH(unicode);
430 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200432 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 }
434
435 if (len == 1) {
436 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100437 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100438 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
439 Py_DECREF(unicode);
440 return latin1_char;
441 }
442 }
443
444 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200445 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100446 return NULL;
447 }
448#else
Victor Stinneraa771272012-10-04 02:32:58 +0200449 assert(Py_REFCNT(unicode) == 1);
450
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100451 /* don't make the result ready in debug mode to ensure that the caller
452 makes the string ready before using it */
453 assert(_PyUnicode_CheckConsistency(unicode, 1));
454#endif
455 return unicode;
456}
457
458static PyObject*
459unicode_result_ready(PyObject *unicode)
460{
461 Py_ssize_t length;
462
463 length = PyUnicode_GET_LENGTH(unicode);
464 if (length == 0) {
465 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100466 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200467 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 }
469 return unicode_empty;
470 }
471
472 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200473 void *data = PyUnicode_DATA(unicode);
474 int kind = PyUnicode_KIND(unicode);
475 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100476 if (ch < 256) {
477 PyObject *latin1_char = unicode_latin1[ch];
478 if (latin1_char != NULL) {
479 if (unicode != latin1_char) {
480 Py_INCREF(latin1_char);
481 Py_DECREF(unicode);
482 }
483 return latin1_char;
484 }
485 else {
486 assert(_PyUnicode_CheckConsistency(unicode, 1));
487 Py_INCREF(unicode);
488 unicode_latin1[ch] = unicode;
489 return unicode;
490 }
491 }
492 }
493
494 assert(_PyUnicode_CheckConsistency(unicode, 1));
495 return unicode;
496}
497
498static PyObject*
499unicode_result(PyObject *unicode)
500{
501 assert(_PyUnicode_CHECK(unicode));
502 if (PyUnicode_IS_READY(unicode))
503 return unicode_result_ready(unicode);
504 else
505 return unicode_result_wchar(unicode);
506}
507
Victor Stinnerc4b49542011-12-11 22:44:26 +0100508static PyObject*
509unicode_result_unchanged(PyObject *unicode)
510{
511 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500512 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100513 return NULL;
514 Py_INCREF(unicode);
515 return unicode;
516 }
517 else
518 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100519 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100520}
521
Victor Stinner3a50e702011-10-18 21:21:00 +0200522#ifdef HAVE_MBCS
523static OSVERSIONINFOEX winver;
524#endif
525
Thomas Wouters477c8d52006-05-27 19:21:47 +0000526/* --- Bloom Filters ----------------------------------------------------- */
527
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the lowest
   log2(BLOOM_WIDTH) bits (at least 5) of each Unicode character as the
   bit index. */
531
532/* the linebreak mask is set up by Unicode_Init below */
533
Antoine Pitrouf068f942010-01-13 14:19:12 +0000534#if LONG_BIT >= 128
535#define BLOOM_WIDTH 128
536#elif LONG_BIT >= 64
537#define BLOOM_WIDTH 64
538#elif LONG_BIT >= 32
539#define BLOOM_WIDTH 32
540#else
541#error "LONG_BIT is smaller than 32"
542#endif
543
Thomas Wouters477c8d52006-05-27 19:21:47 +0000544#define BLOOM_MASK unsigned long
545
Serhiy Storchaka05997252013-01-26 12:14:02 +0200546static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547
/* Nonzero when the bit selected by the low bits of ch is set in mask.
   Both arguments are parenthesized so that expressions such as `a | b`
   can be passed safely as the mask. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000549
Benjamin Peterson29060642009-01-31 22:14:21 +0000550#define BLOOM_LINEBREAK(ch) \
551 ((ch) < 128U ? ascii_linebreak[(ch)] : \
552 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000553
Alexander Belopolsky40018472011-02-26 01:02:56 +0000554Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000556{
Victor Stinnera85af502013-04-09 21:53:54 +0200557#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
558 do { \
559 TYPE *data = (TYPE *)PTR; \
560 TYPE *end = data + LEN; \
561 Py_UCS4 ch; \
562 for (; data != end; data++) { \
563 ch = *data; \
564 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
565 } \
566 break; \
567 } while (0)
568
Thomas Wouters477c8d52006-05-27 19:21:47 +0000569 /* calculate simple bloom-style bitmask for a given unicode string */
570
Antoine Pitrouf068f942010-01-13 14:19:12 +0000571 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000572
573 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200574 switch (kind) {
575 case PyUnicode_1BYTE_KIND:
576 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
577 break;
578 case PyUnicode_2BYTE_KIND:
579 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
580 break;
581 case PyUnicode_4BYTE_KIND:
582 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
583 break;
584 default:
585 assert(0);
586 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000587 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200588
589#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000590}
591
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200592/* Compilation of templated routines */
593
594#include "stringlib/asciilib.h"
595#include "stringlib/fastsearch.h"
596#include "stringlib/partition.h"
597#include "stringlib/split.h"
598#include "stringlib/count.h"
599#include "stringlib/find.h"
600#include "stringlib/find_max_char.h"
601#include "stringlib/localeutil.h"
602#include "stringlib/undef.h"
603
604#include "stringlib/ucs1lib.h"
605#include "stringlib/fastsearch.h"
606#include "stringlib/partition.h"
607#include "stringlib/split.h"
608#include "stringlib/count.h"
609#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300610#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200611#include "stringlib/find_max_char.h"
612#include "stringlib/localeutil.h"
613#include "stringlib/undef.h"
614
615#include "stringlib/ucs2lib.h"
616#include "stringlib/fastsearch.h"
617#include "stringlib/partition.h"
618#include "stringlib/split.h"
619#include "stringlib/count.h"
620#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300621#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200622#include "stringlib/find_max_char.h"
623#include "stringlib/localeutil.h"
624#include "stringlib/undef.h"
625
626#include "stringlib/ucs4lib.h"
627#include "stringlib/fastsearch.h"
628#include "stringlib/partition.h"
629#include "stringlib/split.h"
630#include "stringlib/count.h"
631#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300632#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200633#include "stringlib/find_max_char.h"
634#include "stringlib/localeutil.h"
635#include "stringlib/undef.h"
636
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200637#include "stringlib/unicodedefs.h"
638#include "stringlib/fastsearch.h"
639#include "stringlib/count.h"
640#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100641#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643/* --- Unicode Object ----------------------------------------------------- */
644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200646fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200648Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
649 Py_ssize_t size, Py_UCS4 ch,
650 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200651{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200652 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
653
654 switch (kind) {
655 case PyUnicode_1BYTE_KIND:
656 {
657 Py_UCS1 ch1 = (Py_UCS1) ch;
658 if (ch1 == ch)
659 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
660 else
661 return -1;
662 }
663 case PyUnicode_2BYTE_KIND:
664 {
665 Py_UCS2 ch2 = (Py_UCS2) ch;
666 if (ch2 == ch)
667 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
668 else
669 return -1;
670 }
671 case PyUnicode_4BYTE_KIND:
672 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
673 default:
674 assert(0);
675 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200677}
678
#ifdef Py_DEBUG
/* Fill the newly added part of a Unicode string with invalid characters so
   that bugs are detected earlier.

   Every byte of the characters from index old_length up to the current
   length is set to 0xFF; nothing is done when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects such invalid characters, at
   least for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and
   U+FFFFFFFF is an invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind value is used as the size in bytes of one character */
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
697
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698static PyObject*
699resize_compact(PyObject *unicode, Py_ssize_t length)
700{
701 Py_ssize_t char_size;
702 Py_ssize_t struct_size;
703 Py_ssize_t new_size;
704 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100705 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200706#ifdef Py_DEBUG
707 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
708#endif
709
Victor Stinner79891572012-05-03 13:43:07 +0200710 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200711 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100712 assert(PyUnicode_IS_COMPACT(unicode));
713
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200714 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100715 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 struct_size = sizeof(PyASCIIObject);
717 else
718 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200719 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200720
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
722 PyErr_NoMemory();
723 return NULL;
724 }
725 new_size = (struct_size + (length + 1) * char_size);
726
Victor Stinner84def372011-12-11 20:04:56 +0100727 _Py_DEC_REFTOTAL;
728 _Py_ForgetReference(unicode);
729
730 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
731 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100732 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200733 PyErr_NoMemory();
734 return NULL;
735 }
Victor Stinner84def372011-12-11 20:04:56 +0100736 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100738
Victor Stinnerfe226c02011-10-03 03:52:20 +0200739 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200740 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100742 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200743 _PyUnicode_WSTR_LENGTH(unicode) = length;
744 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100745 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
746 PyObject_DEL(_PyUnicode_WSTR(unicode));
747 _PyUnicode_WSTR(unicode) = NULL;
748 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200749#ifdef Py_DEBUG
750 unicode_fill_invalid(unicode, old_length);
751#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200752 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
753 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200754 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200755 return unicode;
756}
757
Alexander Belopolsky40018472011-02-26 01:02:56 +0000758static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200759resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000760{
Victor Stinner95663112011-10-04 01:03:50 +0200761 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100762 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200763 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000765
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 if (PyUnicode_IS_READY(unicode)) {
767 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200768 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200770#ifdef Py_DEBUG
771 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
772#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773
774 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200775 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200776 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
777 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200778
779 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
780 PyErr_NoMemory();
781 return -1;
782 }
783 new_size = (length + 1) * char_size;
784
Victor Stinner7a9105a2011-12-12 00:13:42 +0100785 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
786 {
787 PyObject_DEL(_PyUnicode_UTF8(unicode));
788 _PyUnicode_UTF8(unicode) = NULL;
789 _PyUnicode_UTF8_LENGTH(unicode) = 0;
790 }
791
Victor Stinnerfe226c02011-10-03 03:52:20 +0200792 data = (PyObject *)PyObject_REALLOC(data, new_size);
793 if (data == NULL) {
794 PyErr_NoMemory();
795 return -1;
796 }
797 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200798 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200799 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200800 _PyUnicode_WSTR_LENGTH(unicode) = length;
801 }
802 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200803 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200804 _PyUnicode_UTF8_LENGTH(unicode) = length;
805 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200806 _PyUnicode_LENGTH(unicode) = length;
807 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200808#ifdef Py_DEBUG
809 unicode_fill_invalid(unicode, old_length);
810#endif
Victor Stinner95663112011-10-04 01:03:50 +0200811 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200812 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200814 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200815 }
Victor Stinner95663112011-10-04 01:03:50 +0200816 assert(_PyUnicode_WSTR(unicode) != NULL);
817
818 /* check for integer overflow */
819 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
820 PyErr_NoMemory();
821 return -1;
822 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100823 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200824 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100825 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200826 if (!wstr) {
827 PyErr_NoMemory();
828 return -1;
829 }
830 _PyUnicode_WSTR(unicode) = wstr;
831 _PyUnicode_WSTR(unicode)[length] = 0;
832 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200833 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834 return 0;
835}
836
Victor Stinnerfe226c02011-10-03 03:52:20 +0200837static PyObject*
838resize_copy(PyObject *unicode, Py_ssize_t length)
839{
840 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100841 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200842 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100843
Benjamin Petersonbac79492012-01-14 13:34:47 -0500844 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100845 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200846
847 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
848 if (copy == NULL)
849 return NULL;
850
851 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200852 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200853 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200854 }
855 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200856 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100857
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200858 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200859 if (w == NULL)
860 return NULL;
861 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
862 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200863 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
864 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200865 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200866 }
867}
868
Guido van Rossumd57fd912000-03-10 22:53:23 +0000869/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000870 Ux0000 terminated; some code (e.g. new_identifier)
871 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000872
873 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000874 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000875
876*/
877
Alexander Belopolsky40018472011-02-26 01:02:56 +0000878static PyUnicodeObject *
879_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200881 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200882 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883
Thomas Wouters477c8d52006-05-27 19:21:47 +0000884 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885 if (length == 0 && unicode_empty != NULL) {
886 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200887 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000888 }
889
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000890 /* Ensure we won't overflow the size. */
891 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
892 return (PyUnicodeObject *)PyErr_NoMemory();
893 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200894 if (length < 0) {
895 PyErr_SetString(PyExc_SystemError,
896 "Negative size passed to _PyUnicode_New");
897 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000898 }
899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200900 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
901 if (unicode == NULL)
902 return NULL;
903 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100904
905 _PyUnicode_WSTR_LENGTH(unicode) = length;
906 _PyUnicode_HASH(unicode) = -1;
907 _PyUnicode_STATE(unicode).interned = 0;
908 _PyUnicode_STATE(unicode).kind = 0;
909 _PyUnicode_STATE(unicode).compact = 0;
910 _PyUnicode_STATE(unicode).ready = 0;
911 _PyUnicode_STATE(unicode).ascii = 0;
912 _PyUnicode_DATA_ANY(unicode) = NULL;
913 _PyUnicode_LENGTH(unicode) = 0;
914 _PyUnicode_UTF8(unicode) = NULL;
915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
918 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100919 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000920 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100921 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000922 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200923
Jeremy Hyltond8082792003-09-16 19:41:39 +0000924 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000925 * the caller fails before initializing str -- unicode_resize()
926 * reads str[0], and the Keep-Alive optimization can keep memory
927 * allocated for str alive across a call to unicode_dealloc(unicode).
928 * We don't want unicode_resize to read uninitialized memory in
929 * that case.
930 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200931 _PyUnicode_WSTR(unicode)[0] = 0;
932 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100933
Victor Stinner7931d9a2011-11-04 00:22:48 +0100934 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000935 return unicode;
936}
937
Victor Stinnerf42dc442011-10-02 23:33:16 +0200938static const char*
939unicode_kind_name(PyObject *unicode)
940{
Victor Stinner42dfd712011-10-03 14:41:45 +0200941 /* don't check consistency: unicode_kind_name() is called from
942 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943 if (!PyUnicode_IS_COMPACT(unicode))
944 {
945 if (!PyUnicode_IS_READY(unicode))
946 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600947 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200948 {
949 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200951 return "legacy ascii";
952 else
953 return "legacy latin1";
954 case PyUnicode_2BYTE_KIND:
955 return "legacy UCS2";
956 case PyUnicode_4BYTE_KIND:
957 return "legacy UCS4";
958 default:
959 return "<legacy invalid kind>";
960 }
961 }
962 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600963 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200964 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200965 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200966 return "ascii";
967 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200968 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200970 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200971 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200972 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200973 default:
974 return "<invalid compact kind>";
975 }
976}
977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200978#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
983
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
987void *_PyUnicode_data(void *unicode){
988 printf("obj %p\n", unicode);
989 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
990 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
991 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
992 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
993 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
994 return PyUnicode_DATA(unicode);
995}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200996
997void
998_PyUnicode_Dump(PyObject *op)
999{
1000 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001001 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1002 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1003 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001004
Victor Stinnera849a4b2011-10-03 12:12:11 +02001005 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001006 {
1007 if (ascii->state.ascii)
1008 data = (ascii + 1);
1009 else
1010 data = (compact + 1);
1011 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001012 else
1013 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +02001014 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
1015
Victor Stinnera849a4b2011-10-03 12:12:11 +02001016 if (ascii->wstr == data)
1017 printf("shared ");
1018 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001019
Victor Stinnera3b334d2011-10-03 13:53:37 +02001020 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001021 printf(" (%zu), ", compact->wstr_length);
1022 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1023 printf("shared ");
1024 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001025 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001026 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001027}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001028#endif
1029
1030PyObject *
1031PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1032{
1033 PyObject *obj;
1034 PyCompactUnicodeObject *unicode;
1035 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001036 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001037 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001038 Py_ssize_t char_size;
1039 Py_ssize_t struct_size;
1040
1041 /* Optimization for empty strings */
1042 if (size == 0 && unicode_empty != NULL) {
1043 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001044 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 }
1046
Victor Stinner9e9d6892011-10-04 01:02:02 +02001047 is_ascii = 0;
1048 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 struct_size = sizeof(PyCompactUnicodeObject);
1050 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001051 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 char_size = 1;
1053 is_ascii = 1;
1054 struct_size = sizeof(PyASCIIObject);
1055 }
1056 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001057 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 char_size = 1;
1059 }
1060 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001061 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 char_size = 2;
1063 if (sizeof(wchar_t) == 2)
1064 is_sharing = 1;
1065 }
1066 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001067 if (maxchar > MAX_UNICODE) {
1068 PyErr_SetString(PyExc_SystemError,
1069 "invalid maximum character passed to PyUnicode_New");
1070 return NULL;
1071 }
Victor Stinner8f825062012-04-27 13:55:39 +02001072 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001073 char_size = 4;
1074 if (sizeof(wchar_t) == 4)
1075 is_sharing = 1;
1076 }
1077
1078 /* Ensure we won't overflow the size. */
1079 if (size < 0) {
1080 PyErr_SetString(PyExc_SystemError,
1081 "Negative size passed to PyUnicode_New");
1082 return NULL;
1083 }
1084 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1085 return PyErr_NoMemory();
1086
1087 /* Duplicated allocation code from _PyObject_New() instead of a call to
1088 * PyObject_New() so we are able to allocate space for the object and
1089 * it's data buffer.
1090 */
1091 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1092 if (obj == NULL)
1093 return PyErr_NoMemory();
1094 obj = PyObject_INIT(obj, &PyUnicode_Type);
1095 if (obj == NULL)
1096 return NULL;
1097
1098 unicode = (PyCompactUnicodeObject *)obj;
1099 if (is_ascii)
1100 data = ((PyASCIIObject*)obj) + 1;
1101 else
1102 data = unicode + 1;
1103 _PyUnicode_LENGTH(unicode) = size;
1104 _PyUnicode_HASH(unicode) = -1;
1105 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001106 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 _PyUnicode_STATE(unicode).compact = 1;
1108 _PyUnicode_STATE(unicode).ready = 1;
1109 _PyUnicode_STATE(unicode).ascii = is_ascii;
1110 if (is_ascii) {
1111 ((char*)data)[size] = 0;
1112 _PyUnicode_WSTR(unicode) = NULL;
1113 }
Victor Stinner8f825062012-04-27 13:55:39 +02001114 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001115 ((char*)data)[size] = 0;
1116 _PyUnicode_WSTR(unicode) = NULL;
1117 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001119 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001120 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 else {
1122 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001123 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001124 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001126 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127 ((Py_UCS4*)data)[size] = 0;
1128 if (is_sharing) {
1129 _PyUnicode_WSTR_LENGTH(unicode) = size;
1130 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1131 }
1132 else {
1133 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1134 _PyUnicode_WSTR(unicode) = NULL;
1135 }
1136 }
Victor Stinner8f825062012-04-27 13:55:39 +02001137#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001138 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001139#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001140 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141 return obj;
1142}
1143
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 and store it in the
   4-byte data of `unicode`.  A high surrogate immediately followed by a low
   surrogate is joined into one code point; every other unit, including a
   lone surrogate, is copied as is.  The other conversions are implemented
   as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);

    src = begin;
    dst = PyUnicode_4BYTE_DATA(unicode);
    while (src < end) {
        /* Never write past the length recorded in the target. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* The target must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1183
Victor Stinnercd9950f2011-10-02 00:34:53 +02001184static int
Victor Stinner488fa492011-12-12 00:01:39 +01001185unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001186{
Victor Stinner488fa492011-12-12 00:01:39 +01001187 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001188 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001189 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001190 return -1;
1191 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001192 return 0;
1193}
1194
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001195static int
1196_copy_characters(PyObject *to, Py_ssize_t to_start,
1197 PyObject *from, Py_ssize_t from_start,
1198 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001199{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001200 unsigned int from_kind, to_kind;
1201 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001202
Victor Stinneree4544c2012-05-09 22:24:08 +02001203 assert(0 <= how_many);
1204 assert(0 <= from_start);
1205 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001206 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001207 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001208 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209
Victor Stinnerd3f08822012-05-29 12:57:52 +02001210 assert(PyUnicode_Check(to));
1211 assert(PyUnicode_IS_READY(to));
1212 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1213
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001214 if (how_many == 0)
1215 return 0;
1216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001218 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001221
Victor Stinnerf1852262012-06-16 16:38:26 +02001222#ifdef Py_DEBUG
1223 if (!check_maxchar
1224 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1225 {
1226 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1227 Py_UCS4 ch;
1228 Py_ssize_t i;
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 assert(ch <= to_maxchar);
1232 }
1233 }
1234#endif
1235
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001236 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001237 if (check_maxchar
1238 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1239 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001240 /* Writing Latin-1 characters into an ASCII string requires to
1241 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001242 Py_UCS4 max_char;
1243 max_char = ucs1lib_find_max_char(from_data,
1244 (Py_UCS1*)from_data + how_many);
1245 if (max_char >= 128)
1246 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001247 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001248 Py_MEMCPY((char*)to_data + to_kind * to_start,
1249 (char*)from_data + from_kind * from_start,
1250 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001252 else if (from_kind == PyUnicode_1BYTE_KIND
1253 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001254 {
1255 _PyUnicode_CONVERT_BYTES(
1256 Py_UCS1, Py_UCS2,
1257 PyUnicode_1BYTE_DATA(from) + from_start,
1258 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1259 PyUnicode_2BYTE_DATA(to) + to_start
1260 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001261 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001262 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001263 && to_kind == PyUnicode_4BYTE_KIND)
1264 {
1265 _PyUnicode_CONVERT_BYTES(
1266 Py_UCS1, Py_UCS4,
1267 PyUnicode_1BYTE_DATA(from) + from_start,
1268 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1269 PyUnicode_4BYTE_DATA(to) + to_start
1270 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001271 }
1272 else if (from_kind == PyUnicode_2BYTE_KIND
1273 && to_kind == PyUnicode_4BYTE_KIND)
1274 {
1275 _PyUnicode_CONVERT_BYTES(
1276 Py_UCS2, Py_UCS4,
1277 PyUnicode_2BYTE_DATA(from) + from_start,
1278 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1279 PyUnicode_4BYTE_DATA(to) + to_start
1280 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001281 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001282 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001283 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1284
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001285 if (!check_maxchar) {
1286 if (from_kind == PyUnicode_2BYTE_KIND
1287 && to_kind == PyUnicode_1BYTE_KIND)
1288 {
1289 _PyUnicode_CONVERT_BYTES(
1290 Py_UCS2, Py_UCS1,
1291 PyUnicode_2BYTE_DATA(from) + from_start,
1292 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1293 PyUnicode_1BYTE_DATA(to) + to_start
1294 );
1295 }
1296 else if (from_kind == PyUnicode_4BYTE_KIND
1297 && to_kind == PyUnicode_1BYTE_KIND)
1298 {
1299 _PyUnicode_CONVERT_BYTES(
1300 Py_UCS4, Py_UCS1,
1301 PyUnicode_4BYTE_DATA(from) + from_start,
1302 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1303 PyUnicode_1BYTE_DATA(to) + to_start
1304 );
1305 }
1306 else if (from_kind == PyUnicode_4BYTE_KIND
1307 && to_kind == PyUnicode_2BYTE_KIND)
1308 {
1309 _PyUnicode_CONVERT_BYTES(
1310 Py_UCS4, Py_UCS2,
1311 PyUnicode_4BYTE_DATA(from) + from_start,
1312 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1313 PyUnicode_2BYTE_DATA(to) + to_start
1314 );
1315 }
1316 else {
1317 assert(0);
1318 return -1;
1319 }
1320 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001321 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001322 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001323 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001324 Py_ssize_t i;
1325
Victor Stinnera0702ab2011-09-29 14:14:38 +02001326 for (i=0; i < how_many; i++) {
1327 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001328 if (ch > to_maxchar)
1329 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001330 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1331 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001332 }
1333 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001334 return 0;
1335}
1336
Victor Stinnerd3f08822012-05-29 12:57:52 +02001337void
1338_PyUnicode_FastCopyCharacters(
1339 PyObject *to, Py_ssize_t to_start,
1340 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001341{
1342 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1343}
1344
1345Py_ssize_t
1346PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1347 PyObject *from, Py_ssize_t from_start,
1348 Py_ssize_t how_many)
1349{
1350 int err;
1351
1352 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1353 PyErr_BadInternalCall();
1354 return -1;
1355 }
1356
Benjamin Petersonbac79492012-01-14 13:34:47 -05001357 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001358 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001359 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001360 return -1;
1361
Victor Stinnerd3f08822012-05-29 12:57:52 +02001362 if (from_start < 0) {
1363 PyErr_SetString(PyExc_IndexError, "string index out of range");
1364 return -1;
1365 }
1366 if (to_start < 0) {
1367 PyErr_SetString(PyExc_IndexError, "string index out of range");
1368 return -1;
1369 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001370 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1371 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1372 PyErr_Format(PyExc_SystemError,
1373 "Cannot write %zi characters at %zi "
1374 "in a string of %zi characters",
1375 how_many, to_start, PyUnicode_GET_LENGTH(to));
1376 return -1;
1377 }
1378
1379 if (how_many == 0)
1380 return 0;
1381
Victor Stinner488fa492011-12-12 00:01:39 +01001382 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001383 return -1;
1384
1385 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1386 if (err) {
1387 PyErr_Format(PyExc_SystemError,
1388 "Cannot copy %s characters "
1389 "into a string of %s characters",
1390 unicode_kind_name(from),
1391 unicode_kind_name(to));
1392 return -1;
1393 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001394 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395}
1396
Victor Stinner17222162011-09-28 22:15:37 +02001397/* Find the maximum code point and count the number of surrogate pairs so a
1398 correct string length can be computed before converting a string to UCS4.
1399 This function counts single surrogates as a character and not as a pair.
1400
1401 Return 0 on success, or -1 on error. */
1402static int
1403find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1404 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405{
1406 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001407 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408
Victor Stinnerc53be962011-10-02 21:33:54 +02001409 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 *num_surrogates = 0;
1411 *maxchar = 0;
1412
1413 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001415 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1416 && (iter+1) < end
1417 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1418 {
1419 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1420 ++(*num_surrogates);
1421 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 }
1423 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001425 {
1426 ch = *iter;
1427 iter++;
1428 }
1429 if (ch > *maxchar) {
1430 *maxchar = ch;
1431 if (*maxchar > MAX_UNICODE) {
1432 PyErr_Format(PyExc_ValueError,
1433 "character U+%x is not in range [U+0000; U+10ffff]",
1434 ch);
1435 return -1;
1436 }
1437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
1439 return 0;
1440}
1441
/* Convert a not-yet-ready, wchar_t-based Unicode object to its canonical
   representation.

   The wstr buffer is scanned with find_maxchar_surrogates(); depending on
   the maximum code point the object becomes a 1-byte, 2-byte or 4-byte
   kind string.  The data buffer is either newly allocated with
   PyObject_MALLOC() (the wstr buffer is then freed and its length reset
   to 0) or shared with the wstr buffer when sizeof(wchar_t) already
   matches the chosen kind.  Length, kind, utf8 and utf8_length fields are
   set in every branch; the ascii flag is set in the 1-byte branch only.
   The ready flag is set before returning.

   Return 0 on success.  Return -1 if find_maxchar_surrogates() fails or
   if an allocation fails (PyErr_NoMemory() is called in that case). */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001442int
1443_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444{
1445 wchar_t *end;
1446 Py_UCS4 maxchar = 0;
1447 Py_ssize_t num_surrogates;
1448#if SIZEOF_WCHAR_T == 2
1449 Py_ssize_t length_wo_surrogates;
1450#endif
1451
Georg Brandl7597add2011-10-05 16:36:47 +02001452 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001453 strings were created using _PyObject_New() and where no canonical
1454 representation (the str field) has been set yet aka strings
1455 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001456 assert(_PyUnicode_CHECK(unicode));
1457 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001459 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001460 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001461 /* Actually, it should neither be interned nor be anything else: */
1462 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463
/* Determine the widest character and the surrogate pair count of wstr. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001465 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001466 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468
/* 1-byte kind: narrow wstr into a new buffer of length + 1 bytes. */
1469 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001470 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1471 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 PyErr_NoMemory();
1473 return -1;
1474 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001475 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 _PyUnicode_WSTR(unicode), end,
1477 PyUnicode_1BYTE_DATA(unicode));
1478 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1479 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1480 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
/* Pure ASCII: the utf8 pointer is set to the very same data buffer. */
1481 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001482 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001483 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001484 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 }
1486 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001487 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001488 _PyUnicode_UTF8(unicode) = NULL;
1489 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 }
/* The wide buffer is no longer needed once the narrow copy exists. */
1491 PyObject_FREE(_PyUnicode_WSTR(unicode));
1492 _PyUnicode_WSTR(unicode) = NULL;
1493 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1494 }
1495 /* In this case we might have to convert down from 4-byte native
1496 wchar_t to 2-byte unicode. */
1497 else if (maxchar < 65536) {
1498 assert(num_surrogates == 0 &&
1499 "FindMaxCharAndNumSurrogatePairs() messed up");
1500
Victor Stinner506f5922011-09-28 22:34:18 +02001501#if SIZEOF_WCHAR_T == 2
1502 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001503 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001504 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1505 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1506 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001507 _PyUnicode_UTF8(unicode) = NULL;
1508 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001509#else
1510 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001511 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001512 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001513 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001514 PyErr_NoMemory();
1515 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516 }
Victor Stinner506f5922011-09-28 22:34:18 +02001517 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1518 _PyUnicode_WSTR(unicode), end,
1519 PyUnicode_2BYTE_DATA(unicode));
1520 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1521 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1522 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001523 _PyUnicode_UTF8(unicode) = NULL;
1524 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001525 PyObject_FREE(_PyUnicode_WSTR(unicode));
1526 _PyUnicode_WSTR(unicode) = NULL;
1527 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1528#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529 }
1530 /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1531 else {
1532#if SIZEOF_WCHAR_T == 2
1533 /* in case the native representation is 2-bytes, we need to allocate a
1534 new normalized 4-byte version. */
1535 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001536 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1537 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538 PyErr_NoMemory();
1539 return -1;
1540 }
1541 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1542 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001543 _PyUnicode_UTF8(unicode) = NULL;
1544 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001545 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1546 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001547 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001548 PyObject_FREE(_PyUnicode_WSTR(unicode));
1549 _PyUnicode_WSTR(unicode) = NULL;
1550 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1551#else
1552 assert(num_surrogates == 0);
1553
/* wchar_t is not 2 bytes here: the wstr buffer is reused as the data. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001554 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001555 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001556 _PyUnicode_UTF8(unicode) = NULL;
1557 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001558 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1559#endif
1560 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1561 }
1562 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001563 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001564 return 0;
1565}
1566
Alexander Belopolsky40018472011-02-26 01:02:56 +00001567static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001568unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569{
Walter Dörwald16807132007-05-25 13:52:07 +00001570 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001571 case SSTATE_NOT_INTERNED:
1572 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001573
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 case SSTATE_INTERNED_MORTAL:
1575 /* revive dead object temporarily for DelItem */
1576 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001577 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001578 Py_FatalError(
1579 "deletion of interned string failed");
1580 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001581
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 case SSTATE_INTERNED_IMMORTAL:
1583 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001584
Benjamin Peterson29060642009-01-31 22:14:21 +00001585 default:
1586 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001587 }
1588
Victor Stinner03490912011-10-03 23:45:12 +02001589 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001590 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001591 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001592 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001593 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1594 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001595
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001596 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001597}
1598
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if unicode is the shared empty string or
   the cached one-character string stored in unicode_latin1[], else 0. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *as_ascii = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a string with a canonical kind and exactly one character can
       be an entry of the latin1 cache. */
    if (as_ascii->state.kind == PyUnicode_WCHAR_KIND || as_ascii->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first < 256 && unicode_latin1[first] == unicode)
        return 1;
    return 0;
}
#endif
1615
Alexander Belopolsky40018472011-02-26 01:02:56 +00001616static int
Victor Stinner488fa492011-12-12 00:01:39 +01001617unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001618{
Victor Stinner488fa492011-12-12 00:01:39 +01001619 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 if (Py_REFCNT(unicode) != 1)
1621 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001622 if (_PyUnicode_HASH(unicode) != -1)
1623 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001624 if (PyUnicode_CHECK_INTERNED(unicode))
1625 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001626 if (!PyUnicode_CheckExact(unicode))
1627 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001628#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001629 /* singleton refcount is greater than 1 */
1630 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001631#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001632 return 1;
1633}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001634
Victor Stinnerfe226c02011-10-03 03:52:20 +02001635static int
1636unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1637{
1638 PyObject *unicode;
1639 Py_ssize_t old_length;
1640
1641 assert(p_unicode != NULL);
1642 unicode = *p_unicode;
1643
1644 assert(unicode != NULL);
1645 assert(PyUnicode_Check(unicode));
1646 assert(0 <= length);
1647
Victor Stinner910337b2011-10-03 03:20:16 +02001648 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001649 old_length = PyUnicode_WSTR_LENGTH(unicode);
1650 else
1651 old_length = PyUnicode_GET_LENGTH(unicode);
1652 if (old_length == length)
1653 return 0;
1654
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001655 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001656 _Py_INCREF_UNICODE_EMPTY();
1657 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001658 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001659 Py_DECREF(*p_unicode);
1660 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001661 return 0;
1662 }
1663
Victor Stinner488fa492011-12-12 00:01:39 +01001664 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001665 PyObject *copy = resize_copy(unicode, length);
1666 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001667 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001668 Py_DECREF(*p_unicode);
1669 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001670 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001671 }
1672
Victor Stinnerfe226c02011-10-03 03:52:20 +02001673 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001674 PyObject *new_unicode = resize_compact(unicode, length);
1675 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001676 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001677 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001678 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001679 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001680 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001681}
1682
Alexander Belopolsky40018472011-02-26 01:02:56 +00001683int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001684PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001685{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001686 PyObject *unicode;
1687 if (p_unicode == NULL) {
1688 PyErr_BadInternalCall();
1689 return -1;
1690 }
1691 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001692 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001693 {
1694 PyErr_BadInternalCall();
1695 return -1;
1696 }
1697 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001698}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001699
Victor Stinnerc5166102012-02-22 13:55:02 +01001700/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001701
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001702 WARNING: The function doesn't copy the terminating null character and
1703 doesn't check the maximum character (may write a latin1 character in an
1704 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001705static void
1706unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1707 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001708{
1709 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1710 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001711 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001712
1713 switch (kind) {
1714 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001715 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001716#ifdef Py_DEBUG
1717 if (PyUnicode_IS_ASCII(unicode)) {
1718 Py_UCS4 maxchar = ucs1lib_find_max_char(
1719 (const Py_UCS1*)str,
1720 (const Py_UCS1*)str + len);
1721 assert(maxchar < 128);
1722 }
1723#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001724 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001725 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001726 }
1727 case PyUnicode_2BYTE_KIND: {
1728 Py_UCS2 *start = (Py_UCS2 *)data + index;
1729 Py_UCS2 *ucs2 = start;
1730 assert(index <= PyUnicode_GET_LENGTH(unicode));
1731
Victor Stinner184252a2012-06-16 02:57:41 +02001732 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001733 *ucs2 = (Py_UCS2)*str;
1734
1735 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001736 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001737 }
1738 default: {
1739 Py_UCS4 *start = (Py_UCS4 *)data + index;
1740 Py_UCS4 *ucs4 = start;
1741 assert(kind == PyUnicode_4BYTE_KIND);
1742 assert(index <= PyUnicode_GET_LENGTH(unicode));
1743
Victor Stinner184252a2012-06-16 02:57:41 +02001744 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001745 *ucs4 = (Py_UCS4)*str;
1746
1747 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001748 }
1749 }
1750}
1751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752static PyObject*
1753get_latin1_char(unsigned char ch)
1754{
Victor Stinnera464fc12011-10-02 20:39:30 +02001755 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001757 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 if (!unicode)
1759 return NULL;
1760 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001761 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 unicode_latin1[ch] = unicode;
1763 }
1764 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001765 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766}
1767
Victor Stinner985a82a2014-01-03 12:53:47 +01001768static PyObject*
1769unicode_char(Py_UCS4 ch)
1770{
1771 PyObject *unicode;
1772
1773 assert(ch <= MAX_UNICODE);
1774
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001775 if (ch < 256)
1776 return get_latin1_char(ch);
1777
Victor Stinner985a82a2014-01-03 12:53:47 +01001778 unicode = PyUnicode_New(1, ch);
1779 if (unicode == NULL)
1780 return NULL;
1781 switch (PyUnicode_KIND(unicode)) {
1782 case PyUnicode_1BYTE_KIND:
1783 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1784 break;
1785 case PyUnicode_2BYTE_KIND:
1786 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1787 break;
1788 default:
1789 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1790 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1791 }
1792 assert(_PyUnicode_CheckConsistency(unicode, 1));
1793 return unicode;
1794}
1795
Alexander Belopolsky40018472011-02-26 01:02:56 +00001796PyObject *
1797PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001799 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 Py_UCS4 maxchar = 0;
1801 Py_ssize_t num_surrogates;
1802
1803 if (u == NULL)
1804 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001806 /* If the Unicode data is known at construction time, we can apply
1807 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001810 if (size == 0)
1811 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 /* Single character Unicode objects in the Latin-1 range are
1814 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001815 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816 return get_latin1_char((unsigned char)*u);
1817
1818 /* If not empty and not single character, copy the Unicode data
1819 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001820 if (find_maxchar_surrogates(u, u + size,
1821 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822 return NULL;
1823
Victor Stinner8faf8212011-12-08 22:14:11 +01001824 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001825 if (!unicode)
1826 return NULL;
1827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828 switch (PyUnicode_KIND(unicode)) {
1829 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001830 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1832 break;
1833 case PyUnicode_2BYTE_KIND:
1834#if Py_UNICODE_SIZE == 2
1835 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1836#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001837 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001838 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1839#endif
1840 break;
1841 case PyUnicode_4BYTE_KIND:
1842#if SIZEOF_WCHAR_T == 2
1843 /* This is the only case which has to process surrogates, thus
1844 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001845 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846#else
1847 assert(num_surrogates == 0);
1848 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1849#endif
1850 break;
1851 default:
1852 assert(0 && "Impossible state");
1853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001855 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856}
1857
Alexander Belopolsky40018472011-02-26 01:02:56 +00001858PyObject *
1859PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001860{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001861 if (size < 0) {
1862 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001863 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001864 return NULL;
1865 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001866 if (u != NULL)
1867 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1868 else
1869 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001870}
1871
Alexander Belopolsky40018472011-02-26 01:02:56 +00001872PyObject *
1873PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001874{
1875 size_t size = strlen(u);
1876 if (size > PY_SSIZE_T_MAX) {
1877 PyErr_SetString(PyExc_OverflowError, "input too long");
1878 return NULL;
1879 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001880 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001881}
1882
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001883PyObject *
1884_PyUnicode_FromId(_Py_Identifier *id)
1885{
1886 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001887 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1888 strlen(id->string),
1889 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001890 if (!id->object)
1891 return NULL;
1892 PyUnicode_InternInPlace(&id->object);
1893 assert(!id->next);
1894 id->next = static_strings;
1895 static_strings = id;
1896 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001897 return id->object;
1898}
1899
1900void
1901_PyUnicode_ClearStaticStrings()
1902{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001903 _Py_Identifier *tmp, *s = static_strings;
1904 while (s) {
1905 Py_DECREF(s->object);
1906 s->object = NULL;
1907 tmp = s->next;
1908 s->next = NULL;
1909 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001910 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001911 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001912}
1913
Benjamin Peterson0df54292012-03-26 14:50:32 -04001914/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001915
Victor Stinnerd3f08822012-05-29 12:57:52 +02001916PyObject*
1917_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001918{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001919 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001920 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001921 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001922#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001923 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001924#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001925 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001926 }
Victor Stinner785938e2011-12-11 20:09:03 +01001927 unicode = PyUnicode_New(size, 127);
1928 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001929 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001930 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1931 assert(_PyUnicode_CheckConsistency(unicode, 1));
1932 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001933}
1934
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001935static Py_UCS4
1936kind_maxchar_limit(unsigned int kind)
1937{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001938 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001939 case PyUnicode_1BYTE_KIND:
1940 return 0x80;
1941 case PyUnicode_2BYTE_KIND:
1942 return 0x100;
1943 case PyUnicode_4BYTE_KIND:
1944 return 0x10000;
1945 default:
1946 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001947 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001948 }
1949}
1950
Victor Stinnere6abb482012-05-02 01:15:40 +02001951Py_LOCAL_INLINE(Py_UCS4)
1952align_maxchar(Py_UCS4 maxchar)
1953{
1954 if (maxchar <= 127)
1955 return 127;
1956 else if (maxchar <= 255)
1957 return 255;
1958 else if (maxchar <= 65535)
1959 return 65535;
1960 else
1961 return MAX_UNICODE;
1962}
1963
Victor Stinner702c7342011-10-05 13:50:52 +02001964static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001965_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001966{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001967 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001968 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001969
Serhiy Storchaka678db842013-01-26 12:16:36 +02001970 if (size == 0)
1971 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001972 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001973 if (size == 1)
1974 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001975
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001976 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001977 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001978 if (!res)
1979 return NULL;
1980 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001981 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001982 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001983}
1984
Victor Stinnere57b1c02011-09-28 22:20:48 +02001985static PyObject*
1986_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001987{
1988 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001989 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001990
Serhiy Storchaka678db842013-01-26 12:16:36 +02001991 if (size == 0)
1992 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001993 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01001994 if (size == 1)
1995 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001996
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001997 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001998 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999 if (!res)
2000 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002001 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002003 else {
2004 _PyUnicode_CONVERT_BYTES(
2005 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2006 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002007 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 return res;
2009}
2010
Victor Stinnere57b1c02011-09-28 22:20:48 +02002011static PyObject*
2012_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002013{
2014 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002015 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002016
Serhiy Storchaka678db842013-01-26 12:16:36 +02002017 if (size == 0)
2018 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002019 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002020 if (size == 1)
2021 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002022
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002023 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002024 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002025 if (!res)
2026 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002027 if (max_char < 256)
2028 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2029 PyUnicode_1BYTE_DATA(res));
2030 else if (max_char < 0x10000)
2031 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2032 PyUnicode_2BYTE_DATA(res));
2033 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002035 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 return res;
2037}
2038
2039PyObject*
2040PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2041{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002042 if (size < 0) {
2043 PyErr_SetString(PyExc_ValueError, "size must be positive");
2044 return NULL;
2045 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002046 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002048 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002049 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002050 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002051 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002052 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002053 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002054 PyErr_SetString(PyExc_SystemError, "invalid kind");
2055 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002057}
2058
Victor Stinnerece58de2012-04-23 23:36:38 +02002059Py_UCS4
2060_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2061{
2062 enum PyUnicode_Kind kind;
2063 void *startptr, *endptr;
2064
2065 assert(PyUnicode_IS_READY(unicode));
2066 assert(0 <= start);
2067 assert(end <= PyUnicode_GET_LENGTH(unicode));
2068 assert(start <= end);
2069
2070 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2071 return PyUnicode_MAX_CHAR_VALUE(unicode);
2072
2073 if (start == end)
2074 return 127;
2075
Victor Stinner94d558b2012-04-27 22:26:58 +02002076 if (PyUnicode_IS_ASCII(unicode))
2077 return 127;
2078
Victor Stinnerece58de2012-04-23 23:36:38 +02002079 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002080 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002081 endptr = (char *)startptr + end * kind;
2082 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002083 switch(kind) {
2084 case PyUnicode_1BYTE_KIND:
2085 return ucs1lib_find_max_char(startptr, endptr);
2086 case PyUnicode_2BYTE_KIND:
2087 return ucs2lib_find_max_char(startptr, endptr);
2088 case PyUnicode_4BYTE_KIND:
2089 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002090 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002091 assert(0);
2092 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002093 }
2094}
2095
Victor Stinner25a4b292011-10-06 12:31:55 +02002096/* Ensure that a string uses the most efficient storage, if it is not the
2097 case: create a new string with of the right kind. Write NULL into *p_unicode
2098 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002099static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002100unicode_adjust_maxchar(PyObject **p_unicode)
2101{
2102 PyObject *unicode, *copy;
2103 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002104 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002105 unsigned int kind;
2106
2107 assert(p_unicode != NULL);
2108 unicode = *p_unicode;
2109 assert(PyUnicode_IS_READY(unicode));
2110 if (PyUnicode_IS_ASCII(unicode))
2111 return;
2112
2113 len = PyUnicode_GET_LENGTH(unicode);
2114 kind = PyUnicode_KIND(unicode);
2115 if (kind == PyUnicode_1BYTE_KIND) {
2116 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002117 max_char = ucs1lib_find_max_char(u, u + len);
2118 if (max_char >= 128)
2119 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002120 }
2121 else if (kind == PyUnicode_2BYTE_KIND) {
2122 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002123 max_char = ucs2lib_find_max_char(u, u + len);
2124 if (max_char >= 256)
2125 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002126 }
2127 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002128 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002129 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002130 max_char = ucs4lib_find_max_char(u, u + len);
2131 if (max_char >= 0x10000)
2132 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002133 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002134 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002135 if (copy != NULL)
2136 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002137 Py_DECREF(unicode);
2138 *p_unicode = copy;
2139}
2140
Victor Stinner034f6cf2011-09-30 02:26:44 +02002141PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002142_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002143{
Victor Stinner87af4f22011-11-21 23:03:47 +01002144 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002145 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002146
Victor Stinner034f6cf2011-09-30 02:26:44 +02002147 if (!PyUnicode_Check(unicode)) {
2148 PyErr_BadInternalCall();
2149 return NULL;
2150 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002151 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002152 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002153
Victor Stinner87af4f22011-11-21 23:03:47 +01002154 length = PyUnicode_GET_LENGTH(unicode);
2155 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002156 if (!copy)
2157 return NULL;
2158 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2159
Victor Stinner87af4f22011-11-21 23:03:47 +01002160 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2161 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002162 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002163 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002164}
2165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166
Victor Stinnerbc603d12011-10-02 01:00:40 +02002167/* Widen Unicode objects to larger buffers. Don't write terminating null
2168 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169
2170void*
2171_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2172{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002173 Py_ssize_t len;
2174 void *result;
2175 unsigned int skind;
2176
Benjamin Petersonbac79492012-01-14 13:34:47 -05002177 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002178 return NULL;
2179
2180 len = PyUnicode_GET_LENGTH(s);
2181 skind = PyUnicode_KIND(s);
2182 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002183 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002184 return NULL;
2185 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002186 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002187 case PyUnicode_2BYTE_KIND:
2188 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2189 if (!result)
2190 return PyErr_NoMemory();
2191 assert(skind == PyUnicode_1BYTE_KIND);
2192 _PyUnicode_CONVERT_BYTES(
2193 Py_UCS1, Py_UCS2,
2194 PyUnicode_1BYTE_DATA(s),
2195 PyUnicode_1BYTE_DATA(s) + len,
2196 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002198 case PyUnicode_4BYTE_KIND:
2199 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2200 if (!result)
2201 return PyErr_NoMemory();
2202 if (skind == PyUnicode_2BYTE_KIND) {
2203 _PyUnicode_CONVERT_BYTES(
2204 Py_UCS2, Py_UCS4,
2205 PyUnicode_2BYTE_DATA(s),
2206 PyUnicode_2BYTE_DATA(s) + len,
2207 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002209 else {
2210 assert(skind == PyUnicode_1BYTE_KIND);
2211 _PyUnicode_CONVERT_BYTES(
2212 Py_UCS1, Py_UCS4,
2213 PyUnicode_1BYTE_DATA(s),
2214 PyUnicode_1BYTE_DATA(s) + len,
2215 result);
2216 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002217 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002218 default:
2219 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 }
Victor Stinner01698042011-10-04 00:04:26 +02002221 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002222 return NULL;
2223}
2224
2225static Py_UCS4*
2226as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2227 int copy_null)
2228{
2229 int kind;
2230 void *data;
2231 Py_ssize_t len, targetlen;
2232 if (PyUnicode_READY(string) == -1)
2233 return NULL;
2234 kind = PyUnicode_KIND(string);
2235 data = PyUnicode_DATA(string);
2236 len = PyUnicode_GET_LENGTH(string);
2237 targetlen = len;
2238 if (copy_null)
2239 targetlen++;
2240 if (!target) {
2241 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2242 PyErr_NoMemory();
2243 return NULL;
2244 }
2245 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2246 if (!target) {
2247 PyErr_NoMemory();
2248 return NULL;
2249 }
2250 }
2251 else {
2252 if (targetsize < targetlen) {
2253 PyErr_Format(PyExc_SystemError,
2254 "string is longer than the buffer");
2255 if (copy_null && 0 < targetsize)
2256 target[0] = 0;
2257 return NULL;
2258 }
2259 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002260 if (kind == PyUnicode_1BYTE_KIND) {
2261 Py_UCS1 *start = (Py_UCS1 *) data;
2262 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002264 else if (kind == PyUnicode_2BYTE_KIND) {
2265 Py_UCS2 *start = (Py_UCS2 *) data;
2266 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2267 }
2268 else {
2269 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002270 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002271 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 if (copy_null)
2273 target[len] = 0;
2274 return target;
2275}
2276
2277Py_UCS4*
2278PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2279 int copy_null)
2280{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002281 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002282 PyErr_BadInternalCall();
2283 return NULL;
2284 }
2285 return as_ucs4(string, target, targetsize, copy_null);
2286}
2287
2288Py_UCS4*
2289PyUnicode_AsUCS4Copy(PyObject *string)
2290{
2291 return as_ucs4(string, NULL, 0, 1);
2292}
2293
2294#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002295
Alexander Belopolsky40018472011-02-26 01:02:56 +00002296PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002297PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002301 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002302 PyErr_BadInternalCall();
2303 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002304 }
2305
Martin v. Löwis790465f2008-04-05 20:41:37 +00002306 if (size == -1) {
2307 size = wcslen(w);
2308 }
2309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002310 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002311}
2312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002313#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002314
Walter Dörwald346737f2007-05-31 10:44:43 +00002315static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002316makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002317 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002318{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002319 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002320 if (longflag)
2321 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002322 else if (longlongflag) {
2323 /* longlongflag should only ever be nonzero on machines with
2324 HAVE_LONG_LONG defined */
2325#ifdef HAVE_LONG_LONG
2326 char *f = PY_FORMAT_LONG_LONG;
2327 while (*f)
2328 *fmt++ = *f++;
2329#else
2330 /* we shouldn't ever get here */
2331 assert(0);
2332 *fmt++ = 'l';
2333#endif
2334 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002335 else if (size_tflag) {
2336 char *f = PY_FORMAT_SIZE_T;
2337 while (*f)
2338 *fmt++ = *f++;
2339 }
2340 *fmt++ = c;
2341 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002342}
2343
/* Size of the buffers used to format a %lld or %p argument.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002348
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002349static int
2350unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2351 Py_ssize_t width, Py_ssize_t precision)
2352{
2353 Py_ssize_t length, fill, arglen;
2354 Py_UCS4 maxchar;
2355
2356 if (PyUnicode_READY(str) == -1)
2357 return -1;
2358
2359 length = PyUnicode_GET_LENGTH(str);
2360 if ((precision == -1 || precision >= length)
2361 && width <= length)
2362 return _PyUnicodeWriter_WriteStr(writer, str);
2363
2364 if (precision != -1)
2365 length = Py_MIN(precision, length);
2366
2367 arglen = Py_MAX(length, width);
2368 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2369 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2370 else
2371 maxchar = writer->maxchar;
2372
2373 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2374 return -1;
2375
2376 if (width > length) {
2377 fill = width - length;
2378 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2379 return -1;
2380 writer->pos += fill;
2381 }
2382
2383 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2384 str, 0, length);
2385 writer->pos += length;
2386 return 0;
2387}
2388
2389static int
2390unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2391 Py_ssize_t width, Py_ssize_t precision)
2392{
2393 /* UTF-8 */
2394 Py_ssize_t length;
2395 PyObject *unicode;
2396 int res;
2397
2398 length = strlen(str);
2399 if (precision != -1)
2400 length = Py_MIN(length, precision);
2401 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2402 if (unicode == NULL)
2403 return -1;
2404
2405 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2406 Py_DECREF(unicode);
2407 return res;
2408}
2409
Victor Stinner96865452011-03-01 23:44:09 +00002410static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002411unicode_fromformat_arg(_PyUnicodeWriter *writer,
2412 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002413{
Victor Stinnere215d962012-10-06 23:03:36 +02002414 const char *p;
2415 Py_ssize_t len;
2416 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002417 Py_ssize_t width;
2418 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002419 int longflag;
2420 int longlongflag;
2421 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002422 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002423
2424 p = f;
2425 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002426 zeropad = 0;
2427 if (*f == '0') {
2428 zeropad = 1;
2429 f++;
2430 }
Victor Stinner96865452011-03-01 23:44:09 +00002431
2432 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002433 width = -1;
2434 if (Py_ISDIGIT((unsigned)*f)) {
2435 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002436 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002437 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002438 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002439 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002440 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002441 return NULL;
2442 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002443 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002444 f++;
2445 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002446 }
2447 precision = -1;
2448 if (*f == '.') {
2449 f++;
2450 if (Py_ISDIGIT((unsigned)*f)) {
2451 precision = (*f - '0');
2452 f++;
2453 while (Py_ISDIGIT((unsigned)*f)) {
2454 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2455 PyErr_SetString(PyExc_ValueError,
2456 "precision too big");
2457 return NULL;
2458 }
2459 precision = (precision * 10) + (*f - '0');
2460 f++;
2461 }
2462 }
Victor Stinner96865452011-03-01 23:44:09 +00002463 if (*f == '%') {
2464 /* "%.3%s" => f points to "3" */
2465 f--;
2466 }
2467 }
2468 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002469 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002470 f--;
2471 }
Victor Stinner96865452011-03-01 23:44:09 +00002472
2473 /* Handle %ld, %lu, %lld and %llu. */
2474 longflag = 0;
2475 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002476 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002477 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002478 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002479 longflag = 1;
2480 ++f;
2481 }
2482#ifdef HAVE_LONG_LONG
2483 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002484 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002485 longlongflag = 1;
2486 f += 2;
2487 }
2488#endif
2489 }
2490 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002491 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002492 size_tflag = 1;
2493 ++f;
2494 }
Victor Stinnere215d962012-10-06 23:03:36 +02002495
2496 if (f[1] == '\0')
2497 writer->overallocate = 0;
2498
2499 switch (*f) {
2500 case 'c':
2501 {
2502 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002503 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002504 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002505 "character argument not in range(0x110000)");
2506 return NULL;
2507 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002508 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002509 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002510 break;
2511 }
2512
2513 case 'i':
2514 case 'd':
2515 case 'u':
2516 case 'x':
2517 {
2518 /* used by sprintf */
2519 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002520 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002521 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002522
2523 if (*f == 'u') {
2524 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2525
2526 if (longflag)
2527 len = sprintf(buffer, fmt,
2528 va_arg(*vargs, unsigned long));
2529#ifdef HAVE_LONG_LONG
2530 else if (longlongflag)
2531 len = sprintf(buffer, fmt,
2532 va_arg(*vargs, unsigned PY_LONG_LONG));
2533#endif
2534 else if (size_tflag)
2535 len = sprintf(buffer, fmt,
2536 va_arg(*vargs, size_t));
2537 else
2538 len = sprintf(buffer, fmt,
2539 va_arg(*vargs, unsigned int));
2540 }
2541 else if (*f == 'x') {
2542 makefmt(fmt, 0, 0, 0, 'x');
2543 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2544 }
2545 else {
2546 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2547
2548 if (longflag)
2549 len = sprintf(buffer, fmt,
2550 va_arg(*vargs, long));
2551#ifdef HAVE_LONG_LONG
2552 else if (longlongflag)
2553 len = sprintf(buffer, fmt,
2554 va_arg(*vargs, PY_LONG_LONG));
2555#endif
2556 else if (size_tflag)
2557 len = sprintf(buffer, fmt,
2558 va_arg(*vargs, Py_ssize_t));
2559 else
2560 len = sprintf(buffer, fmt,
2561 va_arg(*vargs, int));
2562 }
2563 assert(len >= 0);
2564
Victor Stinnere215d962012-10-06 23:03:36 +02002565 if (precision < len)
2566 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002567
2568 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002569 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2570 return NULL;
2571
Victor Stinnere215d962012-10-06 23:03:36 +02002572 if (width > precision) {
2573 Py_UCS4 fillchar;
2574 fill = width - precision;
2575 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002576 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2577 return NULL;
2578 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002579 }
Victor Stinner15a11362012-10-06 23:48:20 +02002580 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002581 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002582 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2583 return NULL;
2584 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002585 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002586
Victor Stinner4a587072013-11-19 12:54:53 +01002587 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2588 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002589 break;
2590 }
2591
2592 case 'p':
2593 {
2594 char number[MAX_LONG_LONG_CHARS];
2595
2596 len = sprintf(number, "%p", va_arg(*vargs, void*));
2597 assert(len >= 0);
2598
2599 /* %p is ill-defined: ensure leading 0x. */
2600 if (number[1] == 'X')
2601 number[1] = 'x';
2602 else if (number[1] != 'x') {
2603 memmove(number + 2, number,
2604 strlen(number) + 1);
2605 number[0] = '0';
2606 number[1] = 'x';
2607 len += 2;
2608 }
2609
Victor Stinner4a587072013-11-19 12:54:53 +01002610 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002611 return NULL;
2612 break;
2613 }
2614
2615 case 's':
2616 {
2617 /* UTF-8 */
2618 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002619 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002620 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002621 break;
2622 }
2623
2624 case 'U':
2625 {
2626 PyObject *obj = va_arg(*vargs, PyObject *);
2627 assert(obj && _PyUnicode_CHECK(obj));
2628
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002629 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002630 return NULL;
2631 break;
2632 }
2633
2634 case 'V':
2635 {
2636 PyObject *obj = va_arg(*vargs, PyObject *);
2637 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002638 if (obj) {
2639 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002640 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002641 return NULL;
2642 }
2643 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002644 assert(str != NULL);
2645 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002646 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002647 }
2648 break;
2649 }
2650
2651 case 'S':
2652 {
2653 PyObject *obj = va_arg(*vargs, PyObject *);
2654 PyObject *str;
2655 assert(obj);
2656 str = PyObject_Str(obj);
2657 if (!str)
2658 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002659 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002660 Py_DECREF(str);
2661 return NULL;
2662 }
2663 Py_DECREF(str);
2664 break;
2665 }
2666
2667 case 'R':
2668 {
2669 PyObject *obj = va_arg(*vargs, PyObject *);
2670 PyObject *repr;
2671 assert(obj);
2672 repr = PyObject_Repr(obj);
2673 if (!repr)
2674 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002675 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002676 Py_DECREF(repr);
2677 return NULL;
2678 }
2679 Py_DECREF(repr);
2680 break;
2681 }
2682
2683 case 'A':
2684 {
2685 PyObject *obj = va_arg(*vargs, PyObject *);
2686 PyObject *ascii;
2687 assert(obj);
2688 ascii = PyObject_ASCII(obj);
2689 if (!ascii)
2690 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002691 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002692 Py_DECREF(ascii);
2693 return NULL;
2694 }
2695 Py_DECREF(ascii);
2696 break;
2697 }
2698
2699 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002700 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002701 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002702 break;
2703
2704 default:
2705 /* if we stumble upon an unknown formatting code, copy the rest
2706 of the format string to the output string. (we cannot just
2707 skip the code, since there's no way to know what's in the
2708 argument list) */
2709 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002710 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002711 return NULL;
2712 f = p+len;
2713 return f;
2714 }
2715
2716 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002717 return f;
2718}
2719
Walter Dörwaldd2034312007-05-18 16:29:38 +00002720PyObject *
2721PyUnicode_FromFormatV(const char *format, va_list vargs)
2722{
Victor Stinnere215d962012-10-06 23:03:36 +02002723 va_list vargs2;
2724 const char *f;
2725 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002726
Victor Stinner8f674cc2013-04-17 23:02:17 +02002727 _PyUnicodeWriter_Init(&writer);
2728 writer.min_length = strlen(format) + 100;
2729 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002730
2731 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2732 Copy it to be able to pass a reference to a subfunction. */
2733 Py_VA_COPY(vargs2, vargs);
2734
2735 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002736 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002737 f = unicode_fromformat_arg(&writer, f, &vargs2);
2738 if (f == NULL)
2739 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002740 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002741 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002742 const char *p;
2743 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002744
Victor Stinnere215d962012-10-06 23:03:36 +02002745 p = f;
2746 do
2747 {
2748 if ((unsigned char)*p > 127) {
2749 PyErr_Format(PyExc_ValueError,
2750 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2751 "string, got a non-ASCII byte: 0x%02x",
2752 (unsigned char)*p);
2753 return NULL;
2754 }
2755 p++;
2756 }
2757 while (*p != '\0' && *p != '%');
2758 len = p - f;
2759
2760 if (*p == '\0')
2761 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002762
2763 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002764 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002765
2766 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002767 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002768 }
Victor Stinnere215d962012-10-06 23:03:36 +02002769 return _PyUnicodeWriter_Finish(&writer);
2770
2771 fail:
2772 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002773 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002774}
2775
Walter Dörwaldd2034312007-05-18 16:29:38 +00002776PyObject *
2777PyUnicode_FromFormat(const char *format, ...)
2778{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002779 PyObject* ret;
2780 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002781
2782#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002783 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002784#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002785 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002786#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002787 ret = PyUnicode_FromFormatV(format, vargs);
2788 va_end(vargs);
2789 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002790}
2791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002792#ifdef HAVE_WCHAR_H
2793
Victor Stinner5593d8a2010-10-02 11:11:27 +00002794/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2795 convert a Unicode object to a wide character string.
2796
Victor Stinnerd88d9832011-09-06 02:00:05 +02002797 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002798 character) required to convert the unicode object. Ignore size argument.
2799
Victor Stinnerd88d9832011-09-06 02:00:05 +02002800 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002801 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002802 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002803static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002804unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002805 wchar_t *w,
2806 Py_ssize_t size)
2807{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002808 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002809 const wchar_t *wstr;
2810
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002811 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002812 if (wstr == NULL)
2813 return -1;
2814
Victor Stinner5593d8a2010-10-02 11:11:27 +00002815 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002816 if (size > res)
2817 size = res + 1;
2818 else
2819 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002820 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002821 return res;
2822 }
2823 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002824 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002825}
2826
2827Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002828PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002829 wchar_t *w,
2830 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831{
2832 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002833 PyErr_BadInternalCall();
2834 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002835 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002836 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837}
2838
Victor Stinner137c34c2010-09-29 10:25:54 +00002839wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002840PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002841 Py_ssize_t *size)
2842{
2843 wchar_t* buffer;
2844 Py_ssize_t buflen;
2845
2846 if (unicode == NULL) {
2847 PyErr_BadInternalCall();
2848 return NULL;
2849 }
2850
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002851 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002852 if (buflen == -1)
2853 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002854 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002855 PyErr_NoMemory();
2856 return NULL;
2857 }
2858
Victor Stinner137c34c2010-09-29 10:25:54 +00002859 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2860 if (buffer == NULL) {
2861 PyErr_NoMemory();
2862 return NULL;
2863 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002864 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002865 if (buflen == -1) {
2866 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002867 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002868 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002869 if (size != NULL)
2870 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002871 return buffer;
2872}
2873
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002874#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002875
Alexander Belopolsky40018472011-02-26 01:02:56 +00002876PyObject *
2877PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002878{
Victor Stinner8faf8212011-12-08 22:14:11 +01002879 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002880 PyErr_SetString(PyExc_ValueError,
2881 "chr() arg not in range(0x110000)");
2882 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002883 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002884
Victor Stinner985a82a2014-01-03 12:53:47 +01002885 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002886}
2887
Alexander Belopolsky40018472011-02-26 01:02:56 +00002888PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002889PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002890{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002891 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002892 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002893 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002894 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002895 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002896 Py_INCREF(obj);
2897 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002898 }
2899 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002900 /* For a Unicode subtype that's not a Unicode object,
2901 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002902 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002903 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002904 PyErr_Format(PyExc_TypeError,
2905 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002906 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002907 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002908}
2909
Alexander Belopolsky40018472011-02-26 01:02:56 +00002910PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002911PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002912 const char *encoding,
2913 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002914{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002915 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002916 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002917
Guido van Rossumd57fd912000-03-10 22:53:23 +00002918 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002919 PyErr_BadInternalCall();
2920 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002921 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002922
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002923 /* Decoding bytes objects is the most common case and should be fast */
2924 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002925 if (PyBytes_GET_SIZE(obj) == 0)
2926 _Py_RETURN_UNICODE_EMPTY();
2927 v = PyUnicode_Decode(
2928 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2929 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002930 return v;
2931 }
2932
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002933 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002934 PyErr_SetString(PyExc_TypeError,
2935 "decoding str is not supported");
2936 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002937 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002938
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002939 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2940 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2941 PyErr_Format(PyExc_TypeError,
2942 "coercing to str: need bytes, bytearray "
2943 "or buffer-like object, %.80s found",
2944 Py_TYPE(obj)->tp_name);
2945 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002946 }
Tim Petersced69f82003-09-16 20:30:58 +00002947
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002948 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002949 PyBuffer_Release(&buffer);
2950 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002951 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002952
Serhiy Storchaka05997252013-01-26 12:14:02 +02002953 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002954 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002955 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002956}
2957
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   lower receives the null-terminated result and must have room for
   lower_len bytes.  Return 1 on success, 0 on error (the normalized name
   plus its terminating null does not fit in lower_len bytes). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* sizeof counts the terminating null byte as well */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* Nothing fits in an empty buffer, not even the terminator.  Without
       this guard lower_len - 1 wraps around and the loop below would write
       outside the buffer. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2997
Alexander Belopolsky40018472011-02-26 01:02:56 +00002998PyObject *
2999PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003000 Py_ssize_t size,
3001 const char *encoding,
3002 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003003{
3004 PyObject *buffer = NULL, *unicode;
3005 Py_buffer info;
3006 char lower[11]; /* Enough for any encoding shortcut */
3007
Fred Drakee4315f52000-05-09 19:53:39 +00003008 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003009 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003010 if ((strcmp(lower, "utf-8") == 0) ||
3011 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003012 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003013 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003014 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003015 (strcmp(lower, "iso-8859-1") == 0) ||
3016 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00003017 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003018#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003019 else if (strcmp(lower, "mbcs") == 0)
3020 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003021#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003022 else if (strcmp(lower, "ascii") == 0)
3023 return PyUnicode_DecodeASCII(s, size, errors);
3024 else if (strcmp(lower, "utf-16") == 0)
3025 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3026 else if (strcmp(lower, "utf-32") == 0)
3027 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3028 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029
3030 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003031 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003032 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003033 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003034 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035 if (buffer == NULL)
3036 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003037 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 if (unicode == NULL)
3039 goto onError;
3040 if (!PyUnicode_Check(unicode)) {
3041 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003042 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3043 "use codecs.decode() to decode to arbitrary types",
3044 encoding,
3045 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 Py_DECREF(unicode);
3047 goto onError;
3048 }
3049 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003050 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003051
Benjamin Peterson29060642009-01-31 22:14:21 +00003052 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053 Py_XDECREF(buffer);
3054 return NULL;
3055}
3056
Alexander Belopolsky40018472011-02-26 01:02:56 +00003057PyObject *
3058PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003059 const char *encoding,
3060 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003061{
3062 PyObject *v;
3063
3064 if (!PyUnicode_Check(unicode)) {
3065 PyErr_BadArgument();
3066 goto onError;
3067 }
3068
3069 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003070 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003071
3072 /* Decode via the codec registry */
3073 v = PyCodec_Decode(unicode, encoding, errors);
3074 if (v == NULL)
3075 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003076 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003077
Benjamin Peterson29060642009-01-31 22:14:21 +00003078 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003079 return NULL;
3080}
3081
Alexander Belopolsky40018472011-02-26 01:02:56 +00003082PyObject *
3083PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003084 const char *encoding,
3085 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003086{
3087 PyObject *v;
3088
3089 if (!PyUnicode_Check(unicode)) {
3090 PyErr_BadArgument();
3091 goto onError;
3092 }
3093
3094 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003095 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003096
3097 /* Decode via the codec registry */
3098 v = PyCodec_Decode(unicode, encoding, errors);
3099 if (v == NULL)
3100 goto onError;
3101 if (!PyUnicode_Check(v)) {
3102 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003103 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3104 "use codecs.decode() to decode to arbitrary types",
3105 encoding,
3106 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003107 Py_DECREF(v);
3108 goto onError;
3109 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003110 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003111
Benjamin Peterson29060642009-01-31 22:14:21 +00003112 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003113 return NULL;
3114}
3115
Alexander Belopolsky40018472011-02-26 01:02:56 +00003116PyObject *
3117PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003118 Py_ssize_t size,
3119 const char *encoding,
3120 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121{
3122 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003123
Guido van Rossumd57fd912000-03-10 22:53:23 +00003124 unicode = PyUnicode_FromUnicode(s, size);
3125 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003126 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3128 Py_DECREF(unicode);
3129 return v;
3130}
3131
Alexander Belopolsky40018472011-02-26 01:02:56 +00003132PyObject *
3133PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003134 const char *encoding,
3135 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003136{
3137 PyObject *v;
3138
3139 if (!PyUnicode_Check(unicode)) {
3140 PyErr_BadArgument();
3141 goto onError;
3142 }
3143
3144 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003145 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003146
3147 /* Encode via the codec registry */
3148 v = PyCodec_Encode(unicode, encoding, errors);
3149 if (v == NULL)
3150 goto onError;
3151 return v;
3152
Benjamin Peterson29060642009-01-31 22:14:21 +00003153 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003154 return NULL;
3155}
3156
/* Locate the first character of the null-terminated wide string wstr that
   wcstombs() cannot convert, by converting one character (or, with 16-bit
   wchar_t, one surrogate pair) at a time.  Return its index in wstr, or 0
   when every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};    /* room for a surrogate pair + null */
#else
    wchar_t unit[2] = {0, 0};       /* one character + null */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *mark = cursor;   /* start of the unit being tried */

        unit[0] = *cursor++;
#if SIZEOF_WCHAR_T == 2
        /* Keep a high surrogate together with the low surrogate that
           follows it so the pair is converted as a single character. */
        if (Py_UNICODE_IS_HIGH_SURROGATE(unit[0])
            && Py_UNICODE_IS_LOW_SURROGATE(*cursor))
            unit[1] = *cursor++;
        else
            unit[1] = 0;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return mark - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3203
Victor Stinner1b579672011-12-17 05:47:23 +01003204static int
3205locale_error_handler(const char *errors, int *surrogateescape)
3206{
3207 if (errors == NULL) {
3208 *surrogateescape = 0;
3209 return 0;
3210 }
3211
3212 if (strcmp(errors, "strict") == 0) {
3213 *surrogateescape = 0;
3214 return 0;
3215 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003216 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003217 *surrogateescape = 1;
3218 return 0;
3219 }
3220 PyErr_Format(PyExc_ValueError,
3221 "only 'strict' and 'surrogateescape' error handlers "
3222 "are supported, not '%s'",
3223 errors);
3224 return -1;
3225}
3226
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003227PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003228PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003229{
3230 Py_ssize_t wlen, wlen2;
3231 wchar_t *wstr;
3232 PyObject *bytes = NULL;
3233 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003234 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003235 PyObject *exc;
3236 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003237 int surrogateescape;
3238
3239 if (locale_error_handler(errors, &surrogateescape) < 0)
3240 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003241
3242 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3243 if (wstr == NULL)
3244 return NULL;
3245
3246 wlen2 = wcslen(wstr);
3247 if (wlen2 != wlen) {
3248 PyMem_Free(wstr);
3249 PyErr_SetString(PyExc_TypeError, "embedded null character");
3250 return NULL;
3251 }
3252
3253 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003254 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003255 char *str;
3256
3257 str = _Py_wchar2char(wstr, &error_pos);
3258 if (str == NULL) {
3259 if (error_pos == (size_t)-1) {
3260 PyErr_NoMemory();
3261 PyMem_Free(wstr);
3262 return NULL;
3263 }
3264 else {
3265 goto encode_error;
3266 }
3267 }
3268 PyMem_Free(wstr);
3269
3270 bytes = PyBytes_FromString(str);
3271 PyMem_Free(str);
3272 }
3273 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003274 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003275 size_t len, len2;
3276
3277 len = wcstombs(NULL, wstr, 0);
3278 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003279 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003280 goto encode_error;
3281 }
3282
3283 bytes = PyBytes_FromStringAndSize(NULL, len);
3284 if (bytes == NULL) {
3285 PyMem_Free(wstr);
3286 return NULL;
3287 }
3288
3289 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3290 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003291 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003292 goto encode_error;
3293 }
3294 PyMem_Free(wstr);
3295 }
3296 return bytes;
3297
3298encode_error:
3299 errmsg = strerror(errno);
3300 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003301
3302 if (error_pos == (size_t)-1)
3303 error_pos = wcstombs_errorpos(wstr);
3304
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003305 PyMem_Free(wstr);
3306 Py_XDECREF(bytes);
3307
Victor Stinner2f197072011-12-17 07:08:30 +01003308 if (errmsg != NULL) {
3309 size_t errlen;
3310 wstr = _Py_char2wchar(errmsg, &errlen);
3311 if (wstr != NULL) {
3312 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003313 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003314 } else
3315 errmsg = NULL;
3316 }
3317 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003318 reason = PyUnicode_FromString(
3319 "wcstombs() encountered an unencodable "
3320 "wide character");
3321 if (reason == NULL)
3322 return NULL;
3323
3324 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3325 "locale", unicode,
3326 (Py_ssize_t)error_pos,
3327 (Py_ssize_t)(error_pos+1),
3328 reason);
3329 Py_DECREF(reason);
3330 if (exc != NULL) {
3331 PyCodec_StrictErrors(exc);
3332 Py_XDECREF(exc);
3333 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003334 return NULL;
3335}
3336
Victor Stinnerad158722010-10-27 00:25:46 +00003337PyObject *
3338PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003339{
Victor Stinner99b95382011-07-04 14:23:54 +02003340#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003341 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003342#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003343 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003344#else
Victor Stinner793b5312011-04-27 00:24:21 +02003345 PyInterpreterState *interp = PyThreadState_GET()->interp;
3346 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3347 cannot use it to encode and decode filenames before it is loaded. Load
3348 the Python codec requires to encode at least its own filename. Use the C
3349 version of the locale codec until the codec registry is initialized and
3350 the Python codec is loaded.
3351
3352 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3353 cannot only rely on it: check also interp->fscodec_initialized for
3354 subinterpreters. */
3355 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003356 return PyUnicode_AsEncodedString(unicode,
3357 Py_FileSystemDefaultEncoding,
3358 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003359 }
3360 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003361 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003362 }
Victor Stinnerad158722010-10-27 00:25:46 +00003363#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003364}
3365
Alexander Belopolsky40018472011-02-26 01:02:56 +00003366PyObject *
3367PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003368 const char *encoding,
3369 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003370{
3371 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003372 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003373
Guido van Rossumd57fd912000-03-10 22:53:23 +00003374 if (!PyUnicode_Check(unicode)) {
3375 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003376 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377 }
Fred Drakee4315f52000-05-09 19:53:39 +00003378
Fred Drakee4315f52000-05-09 19:53:39 +00003379 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003380 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003381 if ((strcmp(lower, "utf-8") == 0) ||
3382 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003383 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003384 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003385 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003386 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003387 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003388 }
Victor Stinner37296e82010-06-10 13:36:23 +00003389 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003390 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003391 (strcmp(lower, "iso-8859-1") == 0) ||
3392 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003393 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003394#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003395 else if (strcmp(lower, "mbcs") == 0)
3396 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003397#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003398 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003399 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003400 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003401
3402 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003403 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003404 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003405 return NULL;
3406
3407 /* The normal path */
3408 if (PyBytes_Check(v))
3409 return v;
3410
3411 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003412 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003413 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003414 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003415
3416 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003417 "encoder %s returned bytearray instead of bytes; "
3418 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003419 encoding);
3420 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003421 Py_DECREF(v);
3422 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003423 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003424
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003425 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3426 Py_DECREF(v);
3427 return b;
3428 }
3429
3430 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003431 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3432 "use codecs.encode() to encode to arbitrary types",
3433 encoding,
3434 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003435 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003436 return NULL;
3437}
3438
Alexander Belopolsky40018472011-02-26 01:02:56 +00003439PyObject *
3440PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003441 const char *encoding,
3442 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003443{
3444 PyObject *v;
3445
3446 if (!PyUnicode_Check(unicode)) {
3447 PyErr_BadArgument();
3448 goto onError;
3449 }
3450
3451 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003452 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003453
3454 /* Encode via the codec registry */
3455 v = PyCodec_Encode(unicode, encoding, errors);
3456 if (v == NULL)
3457 goto onError;
3458 if (!PyUnicode_Check(v)) {
3459 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003460 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3461 "use codecs.encode() to encode to arbitrary types",
3462 encoding,
3463 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003464 Py_DECREF(v);
3465 goto onError;
3466 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003468
Benjamin Peterson29060642009-01-31 22:14:21 +00003469 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 return NULL;
3471}
3472
/* Locate the first byte sequence in the len bytes at str that mbrtowc()
   rejects as invalid or incomplete, and return its offset from str.
   Return 0 when no such sequence is found, and always when mbrtowc() is
   not available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    /* Start from the initial conversion state. */
    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return (size_t)(str - start);
        }
        else {
            str += converted;
            len -= converted;
        }
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    /* Without mbrtowc() the position cannot be determined. */
    return 0;
#endif
}
3503
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003504PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003505PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003506 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003507{
3508 wchar_t smallbuf[256];
3509 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3510 wchar_t *wstr;
3511 size_t wlen, wlen2;
3512 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003513 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003514 size_t error_pos;
3515 char *errmsg;
3516 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003517
3518 if (locale_error_handler(errors, &surrogateescape) < 0)
3519 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003520
3521 if (str[len] != '\0' || len != strlen(str)) {
3522 PyErr_SetString(PyExc_TypeError, "embedded null character");
3523 return NULL;
3524 }
3525
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003526 if (surrogateescape) {
3527 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003528 wstr = _Py_char2wchar(str, &wlen);
3529 if (wstr == NULL) {
3530 if (wlen == (size_t)-1)
3531 PyErr_NoMemory();
3532 else
3533 PyErr_SetFromErrno(PyExc_OSError);
3534 return NULL;
3535 }
3536
3537 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003538 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003539 }
3540 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003541 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003542#ifndef HAVE_BROKEN_MBSTOWCS
3543 wlen = mbstowcs(NULL, str, 0);
3544#else
3545 wlen = len;
3546#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003547 if (wlen == (size_t)-1)
3548 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003549 if (wlen+1 <= smallbuf_len) {
3550 wstr = smallbuf;
3551 }
3552 else {
3553 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3554 return PyErr_NoMemory();
3555
3556 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3557 if (!wstr)
3558 return PyErr_NoMemory();
3559 }
3560
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003561 wlen2 = mbstowcs(wstr, str, wlen+1);
3562 if (wlen2 == (size_t)-1) {
3563 if (wstr != smallbuf)
3564 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003565 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003566 }
3567#ifdef HAVE_BROKEN_MBSTOWCS
3568 assert(wlen2 == wlen);
3569#endif
3570 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3571 if (wstr != smallbuf)
3572 PyMem_Free(wstr);
3573 }
3574 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003575
3576decode_error:
3577 errmsg = strerror(errno);
3578 assert(errmsg != NULL);
3579
3580 error_pos = mbstowcs_errorpos(str, len);
3581 if (errmsg != NULL) {
3582 size_t errlen;
3583 wstr = _Py_char2wchar(errmsg, &errlen);
3584 if (wstr != NULL) {
3585 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003586 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003587 } else
3588 errmsg = NULL;
3589 }
3590 if (errmsg == NULL)
3591 reason = PyUnicode_FromString(
3592 "mbstowcs() encountered an invalid multibyte sequence");
3593 if (reason == NULL)
3594 return NULL;
3595
3596 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3597 "locale", str, len,
3598 (Py_ssize_t)error_pos,
3599 (Py_ssize_t)(error_pos+1),
3600 reason);
3601 Py_DECREF(reason);
3602 if (exc != NULL) {
3603 PyCodec_StrictErrors(exc);
3604 Py_XDECREF(exc);
3605 }
3606 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003607}
3608
3609PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003610PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003611{
3612 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003613 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003614}
3615
3616
3617PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003618PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003619 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003620 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3621}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003622
Christian Heimes5894ba72007-11-04 11:43:14 +00003623PyObject*
3624PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3625{
Victor Stinner99b95382011-07-04 14:23:54 +02003626#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003627 return PyUnicode_DecodeMBCS(s, size, NULL);
3628#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003629 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003630#else
Victor Stinner793b5312011-04-27 00:24:21 +02003631 PyInterpreterState *interp = PyThreadState_GET()->interp;
3632 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3633 cannot use it to encode and decode filenames before it is loaded. Load
3634 the Python codec requires to encode at least its own filename. Use the C
3635 version of the locale codec until the codec registry is initialized and
3636 the Python codec is loaded.
3637
3638 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3639 cannot only rely on it: check also interp->fscodec_initialized for
3640 subinterpreters. */
3641 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003642 return PyUnicode_Decode(s, size,
3643 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003644 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003645 }
3646 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003647 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003648 }
Victor Stinnerad158722010-10-27 00:25:46 +00003649#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003650}
3651
Martin v. Löwis011e8422009-05-05 04:43:17 +00003652
3653int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003654_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003655{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003656 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003657
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003658 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003659 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003660 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3661 PyUnicode_GET_LENGTH(str), '\0', 1);
3662 if (pos == -1)
3663 return 0;
3664 else
3665 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003666}
3667
Antoine Pitrou13348842012-01-29 18:36:34 +01003668int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003669PyUnicode_FSConverter(PyObject* arg, void* addr)
3670{
3671 PyObject *output = NULL;
3672 Py_ssize_t size;
3673 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003674 if (arg == NULL) {
3675 Py_DECREF(*(PyObject**)addr);
3676 return 1;
3677 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003678 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003679 output = arg;
3680 Py_INCREF(output);
3681 }
3682 else {
3683 arg = PyUnicode_FromObject(arg);
3684 if (!arg)
3685 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003686 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003687 Py_DECREF(arg);
3688 if (!output)
3689 return 0;
3690 if (!PyBytes_Check(output)) {
3691 Py_DECREF(output);
3692 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3693 return 0;
3694 }
3695 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003696 size = PyBytes_GET_SIZE(output);
3697 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003698 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003699 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003700 Py_DECREF(output);
3701 return 0;
3702 }
3703 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003704 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003705}
3706
3707
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003708int
3709PyUnicode_FSDecoder(PyObject* arg, void* addr)
3710{
3711 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003712 if (arg == NULL) {
3713 Py_DECREF(*(PyObject**)addr);
3714 return 1;
3715 }
3716 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003717 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003718 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003719 output = arg;
3720 Py_INCREF(output);
3721 }
3722 else {
3723 arg = PyBytes_FromObject(arg);
3724 if (!arg)
3725 return 0;
3726 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3727 PyBytes_GET_SIZE(arg));
3728 Py_DECREF(arg);
3729 if (!output)
3730 return 0;
3731 if (!PyUnicode_Check(output)) {
3732 Py_DECREF(output);
3733 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3734 return 0;
3735 }
3736 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003737 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003738 Py_DECREF(output);
3739 return 0;
3740 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003741 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003742 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003743 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3744 Py_DECREF(output);
3745 return 0;
3746 }
3747 *(PyObject**)addr = output;
3748 return Py_CLEANUP_SUPPORTED;
3749}
3750
3751
Martin v. Löwis5b222132007-06-10 09:51:05 +00003752char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003753PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003754{
Christian Heimesf3863112007-11-22 07:46:41 +00003755 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003756
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003757 if (!PyUnicode_Check(unicode)) {
3758 PyErr_BadArgument();
3759 return NULL;
3760 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003761 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003762 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003763
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003764 if (PyUnicode_UTF8(unicode) == NULL) {
3765 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003766 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3767 if (bytes == NULL)
3768 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003769 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3770 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003771 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003772 Py_DECREF(bytes);
3773 return NULL;
3774 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003775 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3776 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3777 PyBytes_AS_STRING(bytes),
3778 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003779 Py_DECREF(bytes);
3780 }
3781
3782 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003783 *psize = PyUnicode_UTF8_LENGTH(unicode);
3784 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003785}
3786
3787char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003788PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003789{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003790 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3791}
3792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003793Py_UNICODE *
3794PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3795{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003796 const unsigned char *one_byte;
3797#if SIZEOF_WCHAR_T == 4
3798 const Py_UCS2 *two_bytes;
3799#else
3800 const Py_UCS4 *four_bytes;
3801 const Py_UCS4 *ucs4_end;
3802 Py_ssize_t num_surrogates;
3803#endif
3804 wchar_t *w;
3805 wchar_t *wchar_end;
3806
3807 if (!PyUnicode_Check(unicode)) {
3808 PyErr_BadArgument();
3809 return NULL;
3810 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003811 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003812 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003813 assert(_PyUnicode_KIND(unicode) != 0);
3814 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003815
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003816 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003817#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003818 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3819 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003820 num_surrogates = 0;
3821
3822 for (; four_bytes < ucs4_end; ++four_bytes) {
3823 if (*four_bytes > 0xFFFF)
3824 ++num_surrogates;
3825 }
3826
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003827 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3828 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3829 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003830 PyErr_NoMemory();
3831 return NULL;
3832 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003833 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003834
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003835 w = _PyUnicode_WSTR(unicode);
3836 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3837 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003838 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3839 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003840 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003841 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003842 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3843 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003844 }
3845 else
3846 *w = *four_bytes;
3847
3848 if (w > wchar_end) {
3849 assert(0 && "Miscalculated string end");
3850 }
3851 }
3852 *w = 0;
3853#else
3854 /* sizeof(wchar_t) == 4 */
3855 Py_FatalError("Impossible unicode object state, wstr and str "
3856 "should share memory already.");
3857 return NULL;
3858#endif
3859 }
3860 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003861 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3862 (_PyUnicode_LENGTH(unicode) + 1));
3863 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003864 PyErr_NoMemory();
3865 return NULL;
3866 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003867 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3868 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3869 w = _PyUnicode_WSTR(unicode);
3870 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003871
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003872 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3873 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003874 for (; w < wchar_end; ++one_byte, ++w)
3875 *w = *one_byte;
3876 /* null-terminate the wstr */
3877 *w = 0;
3878 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003879 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003880#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003881 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003882 for (; w < wchar_end; ++two_bytes, ++w)
3883 *w = *two_bytes;
3884 /* null-terminate the wstr */
3885 *w = 0;
3886#else
3887 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003888 PyObject_FREE(_PyUnicode_WSTR(unicode));
3889 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003890 Py_FatalError("Impossible unicode object state, wstr "
3891 "and str should share memory already.");
3892 return NULL;
3893#endif
3894 }
3895 else {
3896 assert(0 && "This should never happen.");
3897 }
3898 }
3899 }
3900 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003901 *size = PyUnicode_WSTR_LENGTH(unicode);
3902 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003903}
3904
Alexander Belopolsky40018472011-02-26 01:02:56 +00003905Py_UNICODE *
3906PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003907{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003908 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003909}
3910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003911
Alexander Belopolsky40018472011-02-26 01:02:56 +00003912Py_ssize_t
3913PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003914{
3915 if (!PyUnicode_Check(unicode)) {
3916 PyErr_BadArgument();
3917 goto onError;
3918 }
3919 return PyUnicode_GET_SIZE(unicode);
3920
Benjamin Peterson29060642009-01-31 22:14:21 +00003921 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003922 return -1;
3923}
3924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003925Py_ssize_t
3926PyUnicode_GetLength(PyObject *unicode)
3927{
Victor Stinner07621332012-06-16 04:53:46 +02003928 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003929 PyErr_BadArgument();
3930 return -1;
3931 }
Victor Stinner07621332012-06-16 04:53:46 +02003932 if (PyUnicode_READY(unicode) == -1)
3933 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003934 return PyUnicode_GET_LENGTH(unicode);
3935}
3936
3937Py_UCS4
3938PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3939{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003940 void *data;
3941 int kind;
3942
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003943 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3944 PyErr_BadArgument();
3945 return (Py_UCS4)-1;
3946 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003947 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003948 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003949 return (Py_UCS4)-1;
3950 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003951 data = PyUnicode_DATA(unicode);
3952 kind = PyUnicode_KIND(unicode);
3953 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003954}
3955
3956int
3957PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3958{
3959 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003960 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003961 return -1;
3962 }
Victor Stinner488fa492011-12-12 00:01:39 +01003963 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003964 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003965 PyErr_SetString(PyExc_IndexError, "string index out of range");
3966 return -1;
3967 }
Victor Stinner488fa492011-12-12 00:01:39 +01003968 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003969 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003970 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3971 PyErr_SetString(PyExc_ValueError, "character out of range");
3972 return -1;
3973 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003974 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3975 index, ch);
3976 return 0;
3977}
3978
/* Return the name of the default encoding, which is always "utf-8". The
   returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3984
Victor Stinner554f3f02010-06-16 23:33:54 +00003985/* create or adjust a UnicodeDecodeError */
3986static void
3987make_decode_exception(PyObject **exceptionObject,
3988 const char *encoding,
3989 const char *input, Py_ssize_t length,
3990 Py_ssize_t startpos, Py_ssize_t endpos,
3991 const char *reason)
3992{
3993 if (*exceptionObject == NULL) {
3994 *exceptionObject = PyUnicodeDecodeError_Create(
3995 encoding, input, length, startpos, endpos, reason);
3996 }
3997 else {
3998 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3999 goto onError;
4000 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4001 goto onError;
4002 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4003 goto onError;
4004 }
4005 return;
4006
4007onError:
4008 Py_DECREF(*exceptionObject);
4009 *exceptionObject = NULL;
4010}
4011
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   In/out parameters:
   - *errorHandler is looked up from `errors` on first use and cached.
   - *exceptionObject is created or updated by make_decode_exception().
   - *input and *inend are reloaded from the exception's object, because
     the callback may have replaced it.
   - *output is a wchar-kind string; it is resized when the replacement
     plus the remaining input would not fit, and *outpos is advanced past
     the replacement.
   - *endinpos and *inptr are set to the position returned by the handler
     (negative values count from the end of the input).
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" format prefix, leaving the message. */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        /* Bail out: the bytes accessors below must not be applied to a
           non-bytes object, and the function must not report success with
           an exception pending. */
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string).
       The sum is built step by step so that signed overflow is detected
       instead of invoked. */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow geometrically when doubling is both sufficient and safe. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4118
4119static int
4120unicode_decode_call_errorhandler_writer(
4121 const char *errors, PyObject **errorHandler,
4122 const char *encoding, const char *reason,
4123 const char **input, const char **inend, Py_ssize_t *startinpos,
4124 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4125 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4126{
4127 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4128
4129 PyObject *restuple = NULL;
4130 PyObject *repunicode = NULL;
4131 Py_ssize_t insize;
4132 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004133 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004134 PyObject *inputobj = NULL;
4135
4136 if (*errorHandler == NULL) {
4137 *errorHandler = PyCodec_LookupError(errors);
4138 if (*errorHandler == NULL)
4139 goto onError;
4140 }
4141
4142 make_decode_exception(exceptionObject,
4143 encoding,
4144 *input, *inend - *input,
4145 *startinpos, *endinpos,
4146 reason);
4147 if (*exceptionObject == NULL)
4148 goto onError;
4149
4150 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4151 if (restuple == NULL)
4152 goto onError;
4153 if (!PyTuple_Check(restuple)) {
4154 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4155 goto onError;
4156 }
4157 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004158 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004159
4160 /* Copy back the bytes variables, which might have been modified by the
4161 callback */
4162 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4163 if (!inputobj)
4164 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004165 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004166 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004167 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004168 *input = PyBytes_AS_STRING(inputobj);
4169 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004170 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004171 /* we can DECREF safely, as the exception has another reference,
4172 so the object won't go away. */
4173 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004174
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004175 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004176 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004177 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004178 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4179 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004180 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004181
Victor Stinner8f674cc2013-04-17 23:02:17 +02004182 if (PyUnicode_READY(repunicode) < 0)
4183 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004184 replen = PyUnicode_GET_LENGTH(repunicode);
4185 writer->min_length += replen;
4186 if (replen > 1)
Victor Stinner8f674cc2013-04-17 23:02:17 +02004187 writer->overallocate = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004188 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004189 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004190
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004192 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004193
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004195 Py_XDECREF(restuple);
4196 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197
Benjamin Peterson29060642009-01-31 22:14:21 +00004198 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004200 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201}
4202
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  Each one evaluates its argument
   more than once (TO_BASE64 excepted), so arguments must be free of side
   effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   (any input that is not a letter, digit or '+' maps to 63) */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first, so utf7_category is only indexed with
 * values 1..127; NUL is never encoded directly. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))

Alexander Belopolsky40018472011-02-26 01:02:56 +00004284PyObject *
4285PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004286 Py_ssize_t size,
4287 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004288{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004289 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4290}
4291
Antoine Pitrou244651a2009-05-04 18:56:13 +00004292/* The decoder. The only state we preserve is our read position,
4293 * i.e. how many characters we have consumed. So if we end in the
4294 * middle of a shift sequence we have to back off the read position
4295 * and the output to the beginning of the sequence, otherwise we lose
4296 * all the shift state (seen bits, number of bits seen, high
4297 * surrogate). */
4298
Alexander Belopolsky40018472011-02-26 01:02:56 +00004299PyObject *
4300PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004301 Py_ssize_t size,
4302 const char *errors,
4303 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004304{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004305 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004306 Py_ssize_t startinpos;
4307 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004308 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004309 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004310 const char *errmsg = "";
4311 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004312 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004313 unsigned int base64bits = 0;
4314 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004315 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004316 PyObject *errorHandler = NULL;
4317 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004318
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004319 if (size == 0) {
4320 if (consumed)
4321 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004322 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004323 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004324
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004325 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004326 _PyUnicodeWriter_Init(&writer);
4327 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004328
4329 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004330 e = s + size;
4331
4332 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004333 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004334 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004335 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004336
Antoine Pitrou244651a2009-05-04 18:56:13 +00004337 if (inShift) { /* in a base-64 section */
4338 if (IS_BASE64(ch)) { /* consume a base-64 character */
4339 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4340 base64bits += 6;
4341 s++;
4342 if (base64bits >= 16) {
4343 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004344 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004345 base64bits -= 16;
4346 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004347 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004348 if (surrogate) {
4349 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004350 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4351 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004352 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004353 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004354 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004355 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004356 }
4357 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004358 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004359 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004360 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004361 }
4362 }
Victor Stinner551ac952011-11-29 22:58:13 +01004363 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004364 /* first surrogate */
4365 surrogate = outCh;
4366 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004367 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004368 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004369 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004370 }
4371 }
4372 }
4373 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004374 inShift = 0;
4375 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004376 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004377 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004378 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004379 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004380 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004381 if (base64bits > 0) { /* left-over bits */
4382 if (base64bits >= 6) {
4383 /* We've seen at least one base-64 character */
4384 errmsg = "partial character in shift sequence";
4385 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004386 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004387 else {
4388 /* Some bits remain; they should be zero */
4389 if (base64buffer != 0) {
4390 errmsg = "non-zero padding bits in shift sequence";
4391 goto utf7Error;
4392 }
4393 }
4394 }
4395 if (ch != '-') {
4396 /* '-' is absorbed; other terminating
4397 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004398 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004399 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004400 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004401 }
4402 }
4403 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004404 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004405 s++; /* consume '+' */
4406 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004407 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004408 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004409 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004410 }
4411 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004412 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004413 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004414 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004415 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004416 }
4417 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004418 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004419 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004420 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004421 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004422 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004423 else {
4424 startinpos = s-starts;
4425 s++;
4426 errmsg = "unexpected special character";
4427 goto utf7Error;
4428 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004429 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004430utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004431 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004432 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004433 errors, &errorHandler,
4434 "utf7", errmsg,
4435 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004436 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004437 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004438 }
4439
Antoine Pitrou244651a2009-05-04 18:56:13 +00004440 /* end of string */
4441
4442 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4443 /* if we're in an inconsistent state, that's an error */
4444 if (surrogate ||
4445 (base64bits >= 6) ||
4446 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004447 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004448 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004449 errors, &errorHandler,
4450 "utf7", "unterminated shift sequence",
4451 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004452 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004453 goto onError;
4454 if (s < e)
4455 goto restart;
4456 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004457 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004458
4459 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004460 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004461 if (inShift) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004462 writer.pos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004463 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004464 }
4465 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004466 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004467 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004468 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004469
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004470 Py_XDECREF(errorHandler);
4471 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004472 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004473
Benjamin Peterson29060642009-01-31 22:14:21 +00004474 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004475 Py_XDECREF(errorHandler);
4476 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004477 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004478 return NULL;
4479}
4480
4481
Alexander Belopolsky40018472011-02-26 01:02:56 +00004482PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004483_PyUnicode_EncodeUTF7(PyObject *str,
4484 int base64SetO,
4485 int base64WhiteSpace,
4486 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004487{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004488 int kind;
4489 void *data;
4490 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004491 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004492 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004493 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004494 unsigned int base64bits = 0;
4495 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004496 char * out;
4497 char * start;
4498
Benjamin Petersonbac79492012-01-14 13:34:47 -05004499 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004500 return NULL;
4501 kind = PyUnicode_KIND(str);
4502 data = PyUnicode_DATA(str);
4503 len = PyUnicode_GET_LENGTH(str);
4504
4505 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004506 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004507
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004508 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004509 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004510 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004511 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004512 if (v == NULL)
4513 return NULL;
4514
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004515 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004516 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004517 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004518
Antoine Pitrou244651a2009-05-04 18:56:13 +00004519 if (inShift) {
4520 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4521 /* shifting out */
4522 if (base64bits) { /* output remaining bits */
4523 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4524 base64buffer = 0;
4525 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004526 }
4527 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004528 /* Characters not in the BASE64 set implicitly unshift the sequence
4529 so no '-' is required, except if the character is itself a '-' */
4530 if (IS_BASE64(ch) || ch == '-') {
4531 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004532 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004533 *out++ = (char) ch;
4534 }
4535 else {
4536 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004537 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004538 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004539 else { /* not in a shift sequence */
4540 if (ch == '+') {
4541 *out++ = '+';
4542 *out++ = '-';
4543 }
4544 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4545 *out++ = (char) ch;
4546 }
4547 else {
4548 *out++ = '+';
4549 inShift = 1;
4550 goto encode_char;
4551 }
4552 }
4553 continue;
4554encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004555 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004556 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004557
Antoine Pitrou244651a2009-05-04 18:56:13 +00004558 /* code first surrogate */
4559 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004560 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 while (base64bits >= 6) {
4562 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4563 base64bits -= 6;
4564 }
4565 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004566 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004567 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004568 base64bits += 16;
4569 base64buffer = (base64buffer << 16) | ch;
4570 while (base64bits >= 6) {
4571 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4572 base64bits -= 6;
4573 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004574 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004575 if (base64bits)
4576 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4577 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004578 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004579 if (_PyBytes_Resize(&v, out - start) < 0)
4580 return NULL;
4581 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004582}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004583PyObject *
4584PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4585 Py_ssize_t size,
4586 int base64SetO,
4587 int base64WhiteSpace,
4588 const char *errors)
4589{
4590 PyObject *result;
4591 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4592 if (tmp == NULL)
4593 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004594 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004595 base64WhiteSpace, errors);
4596 Py_DECREF(tmp);
4597 return result;
4598}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004599
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600#undef IS_BASE64
4601#undef FROM_BASE64
4602#undef TO_BASE64
4603#undef DECODE_DIRECT
4604#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004605
Guido van Rossumd57fd912000-03-10 22:53:23 +00004606/* --- UTF-8 Codec -------------------------------------------------------- */
4607
Alexander Belopolsky40018472011-02-26 01:02:56 +00004608PyObject *
4609PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004610 Py_ssize_t size,
4611 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612{
Walter Dörwald69652032004-09-07 20:24:22 +00004613 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4614}
4615
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004616#include "stringlib/asciilib.h"
4617#include "stringlib/codecs.h"
4618#include "stringlib/undef.h"
4619
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004620#include "stringlib/ucs1lib.h"
4621#include "stringlib/codecs.h"
4622#include "stringlib/undef.h"
4623
4624#include "stringlib/ucs2lib.h"
4625#include "stringlib/codecs.h"
4626#include "stringlib/undef.h"
4627
4628#include "stringlib/ucs4lib.h"
4629#include "stringlib/codecs.h"
4630#include "stringlib/undef.h"
4631
Antoine Pitrouab868312009-01-10 15:40:25 +00004632/* Mask to quickly check whether a C 'long' contains a
4633 non-ASCII, UTF8-encoded char. */
4634#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004635# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004636#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004637# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004638#else
4639# error C 'long' size should be either 4 or 8!
4640#endif
4641
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004642static Py_ssize_t
4643ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004644{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004645 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004646 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004647
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004648 /*
4649 * Issue #17237: m68k is a bit different from most architectures in
4650 * that objects do not use "natural alignment" - for example, int and
4651 * long are only aligned at 2-byte boundaries. Therefore the assert()
4652 * won't work; also, tests have shown that skipping the "optimised
4653 * version" will even speed up m68k.
4654 */
4655#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004656#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004657 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4658 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004659 /* Fast path, see in STRINGLIB(utf8_decode) for
4660 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004661 /* Help allocation */
4662 const char *_p = p;
4663 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004664 while (_p < aligned_end) {
4665 unsigned long value = *(const unsigned long *) _p;
4666 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004667 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004668 *((unsigned long *)q) = value;
4669 _p += SIZEOF_LONG;
4670 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004671 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004672 p = _p;
4673 while (p < end) {
4674 if ((unsigned char)*p & 0x80)
4675 break;
4676 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004677 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004678 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004679 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004680#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004681#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004682 while (p < end) {
4683 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4684 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004685 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004686 /* Help allocation */
4687 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004688 while (_p < aligned_end) {
4689 unsigned long value = *(unsigned long *) _p;
4690 if (value & ASCII_CHAR_MASK)
4691 break;
4692 _p += SIZEOF_LONG;
4693 }
4694 p = _p;
4695 if (_p == end)
4696 break;
4697 }
4698 if ((unsigned char)*p & 0x80)
4699 break;
4700 ++p;
4701 }
4702 memcpy(dest, start, p - start);
4703 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004704}
Antoine Pitrouab868312009-01-10 15:40:25 +00004705
Victor Stinner785938e2011-12-11 20:09:03 +01004706PyObject *
4707PyUnicode_DecodeUTF8Stateful(const char *s,
4708 Py_ssize_t size,
4709 const char *errors,
4710 Py_ssize_t *consumed)
4711{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004712 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004713 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004714 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004715
4716 Py_ssize_t startinpos;
4717 Py_ssize_t endinpos;
4718 const char *errmsg = "";
4719 PyObject *errorHandler = NULL;
4720 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004721
4722 if (size == 0) {
4723 if (consumed)
4724 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004725 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004726 }
4727
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004728 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4729 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004730 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004731 *consumed = 1;
4732 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004733 }
4734
Victor Stinner8f674cc2013-04-17 23:02:17 +02004735 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004736 writer.min_length = size;
4737 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004738 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004739
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004740 writer.pos = ascii_decode(s, end, writer.data);
4741 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004742 while (s < end) {
4743 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004744 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004745 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004746 if (PyUnicode_IS_ASCII(writer.buffer))
4747 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004748 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004749 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004750 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004751 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004752 } else {
4753 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004754 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004755 }
4756
4757 switch (ch) {
4758 case 0:
4759 if (s == end || consumed)
4760 goto End;
4761 errmsg = "unexpected end of data";
4762 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004763 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004764 break;
4765 case 1:
4766 errmsg = "invalid start byte";
4767 startinpos = s - starts;
4768 endinpos = startinpos + 1;
4769 break;
4770 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004771 case 3:
4772 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004773 errmsg = "invalid continuation byte";
4774 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004775 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004776 break;
4777 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004778 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004779 goto onError;
4780 continue;
4781 }
4782
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004783 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004784 errors, &errorHandler,
4785 "utf-8", errmsg,
4786 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004787 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004788 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004789 }
4790
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004791End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004792 if (consumed)
4793 *consumed = s - starts;
4794
4795 Py_XDECREF(errorHandler);
4796 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004797 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004798
4799onError:
4800 Py_XDECREF(errorHandler);
4801 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004802 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004803 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004804}
4805
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Every byte that cannot be decoded is stored as the code unit
   0xDC00 + byte.

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_RawFree() to free the memory), or NULL if the
   buffer size would overflow or the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
            /* The decoder handed back a character instead of storing
               it itself. */
#if SIZEOF_WCHAR_T == 4
            /* not expected when the output holds full code points */
            assert(0);
#else
            /* Only a code point above the BMP cannot be stored in one
               16-bit unit.  It is never a surrogate itself; it is the
               value that gets split into a surrogate pair here. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* small values are status codes: 0 at the very end of the
               input means everything was decoded */
            if (!ch && s == e)
                break;
            /* surrogateescape: keep the offending byte and move on */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004862/* Primary internal function which creates utf8 encoded bytes objects.
4863
4864 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004865 and allocate exactly as much space needed at the end. Else allocate the
4866 maximum possible needed (4 result bytes per Unicode character), and return
4867 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004868*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004869PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004870_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871{
Victor Stinner6099a032011-12-18 14:22:26 +01004872 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004873 void *data;
4874 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004876 if (!PyUnicode_Check(unicode)) {
4877 PyErr_BadArgument();
4878 return NULL;
4879 }
4880
4881 if (PyUnicode_READY(unicode) == -1)
4882 return NULL;
4883
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004884 if (PyUnicode_UTF8(unicode))
4885 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4886 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004887
4888 kind = PyUnicode_KIND(unicode);
4889 data = PyUnicode_DATA(unicode);
4890 size = PyUnicode_GET_LENGTH(unicode);
4891
Benjamin Petersonead6b532011-12-20 17:23:42 -06004892 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004893 default:
4894 assert(0);
4895 case PyUnicode_1BYTE_KIND:
4896 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4897 assert(!PyUnicode_IS_ASCII(unicode));
4898 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4899 case PyUnicode_2BYTE_KIND:
4900 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4901 case PyUnicode_4BYTE_KIND:
4902 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004903 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904}
4905
Alexander Belopolsky40018472011-02-26 01:02:56 +00004906PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004907PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4908 Py_ssize_t size,
4909 const char *errors)
4910{
4911 PyObject *v, *unicode;
4912
4913 unicode = PyUnicode_FromUnicode(s, size);
4914 if (unicode == NULL)
4915 return NULL;
4916 v = _PyUnicode_AsUTF8String(unicode, errors);
4917 Py_DECREF(unicode);
4918 return v;
4919}
4920
4921PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004922PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004923{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004924 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925}
4926
Walter Dörwald41980ca2007-08-16 21:55:45 +00004927/* --- UTF-32 Codec ------------------------------------------------------- */
4928
4929PyObject *
4930PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004931 Py_ssize_t size,
4932 const char *errors,
4933 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004934{
4935 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4936}
4937
4938PyObject *
4939PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004940 Py_ssize_t size,
4941 const char *errors,
4942 int *byteorder,
4943 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004944{
4945 const char *starts = s;
4946 Py_ssize_t startinpos;
4947 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004948 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004949 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004950 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004951 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004952 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004953 PyObject *errorHandler = NULL;
4954 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004955
Walter Dörwald41980ca2007-08-16 21:55:45 +00004956 q = (unsigned char *)s;
4957 e = q + size;
4958
4959 if (byteorder)
4960 bo = *byteorder;
4961
4962 /* Check for BOM marks (U+FEFF) in the input and adjust current
4963 byte order setting accordingly. In native mode, the leading BOM
4964 mark is skipped, in all other modes, it is copied to the output
4965 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004966 if (bo == 0 && size >= 4) {
4967 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4968 if (bom == 0x0000FEFF) {
4969 bo = -1;
4970 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004971 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004972 else if (bom == 0xFFFE0000) {
4973 bo = 1;
4974 q += 4;
4975 }
4976 if (byteorder)
4977 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004978 }
4979
Victor Stinnere64322e2012-10-30 23:12:47 +01004980 if (q == e) {
4981 if (consumed)
4982 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004983 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004984 }
4985
Victor Stinnere64322e2012-10-30 23:12:47 +01004986#ifdef WORDS_BIGENDIAN
4987 le = bo < 0;
4988#else
4989 le = bo <= 0;
4990#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004991 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01004992
Victor Stinner8f674cc2013-04-17 23:02:17 +02004993 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004994 writer.min_length = (e - q + 3) / 4;
4995 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004996 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004997
Victor Stinnere64322e2012-10-30 23:12:47 +01004998 while (1) {
4999 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005000 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005001
Victor Stinnere64322e2012-10-30 23:12:47 +01005002 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005003 enum PyUnicode_Kind kind = writer.kind;
5004 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005005 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005006 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005007 if (le) {
5008 do {
5009 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5010 if (ch > maxch)
5011 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005012 if (kind != PyUnicode_1BYTE_KIND &&
5013 Py_UNICODE_IS_SURROGATE(ch))
5014 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005015 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005016 q += 4;
5017 } while (q <= last);
5018 }
5019 else {
5020 do {
5021 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5022 if (ch > maxch)
5023 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005024 if (kind != PyUnicode_1BYTE_KIND &&
5025 Py_UNICODE_IS_SURROGATE(ch))
5026 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005027 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005028 q += 4;
5029 } while (q <= last);
5030 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005031 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005032 }
5033
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005034 if (Py_UNICODE_IS_SURROGATE(ch)) {
5035 errmsg = "codepoint in surrogate code point range(0xd800, 0xe000)";
5036 startinpos = ((const char *)q) - starts;
5037 endinpos = startinpos + 4;
5038 }
5039 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005040 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005041 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005042 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005043 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005044 startinpos = ((const char *)q) - starts;
5045 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005046 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005047 else {
5048 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005049 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005050 goto onError;
5051 q += 4;
5052 continue;
5053 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005055 startinpos = ((const char *)q) - starts;
5056 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005057 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005058
5059 /* The remaining input chars are ignored if the callback
5060 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005061 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005062 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005063 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005065 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005067 }
5068
Walter Dörwald41980ca2007-08-16 21:55:45 +00005069 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005070 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005071
Walter Dörwald41980ca2007-08-16 21:55:45 +00005072 Py_XDECREF(errorHandler);
5073 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005074 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005075
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005077 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005078 Py_XDECREF(errorHandler);
5079 Py_XDECREF(exc);
5080 return NULL;
5081}
5082
5083PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005084_PyUnicode_EncodeUTF32(PyObject *str,
5085 const char *errors,
5086 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005087{
Serhiy Storchaka30793282014-01-04 22:44:01 +02005088 int kind;
5089 void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005090 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005091 PyObject *v;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005092 unsigned char *p;
5093 Py_ssize_t nsize, i;
5094 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005095#if PY_LITTLE_ENDIAN
Serhiy Storchaka30793282014-01-04 22:44:01 +02005096 int iorder[] = {0, 1, 2, 3};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005097#else
Serhiy Storchaka30793282014-01-04 22:44:01 +02005098 int iorder[] = {3, 2, 1, 0};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005099#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005100 const char *encoding;
5101 PyObject *errorHandler = NULL;
5102 PyObject *exc = NULL;
5103 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005104
Serhiy Storchaka30793282014-01-04 22:44:01 +02005105#define STORECHAR(CH) \
5106 do { \
5107 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5108 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5109 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5110 p[iorder[0]] = (CH) & 0xff; \
5111 p += 4; \
5112 } while(0)
5113
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005114 if (!PyUnicode_Check(str)) {
5115 PyErr_BadArgument();
5116 return NULL;
5117 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005118 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005119 return NULL;
5120 kind = PyUnicode_KIND(str);
5121 data = PyUnicode_DATA(str);
5122 len = PyUnicode_GET_LENGTH(str);
5123
Serhiy Storchaka583a9392014-01-04 19:25:37 +02005124 nsize = len + (byteorder == 0);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005125 if (nsize > PY_SSIZE_T_MAX / 4)
5126 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005127 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005128 if (v == NULL)
5129 return NULL;
5130
Serhiy Storchaka30793282014-01-04 22:44:01 +02005131 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005132 if (byteorder == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005133 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005134 if (len == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005135 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005136
Serhiy Storchaka30793282014-01-04 22:44:01 +02005137 if (byteorder == -1) {
5138 /* force LE */
5139 iorder[0] = 0;
5140 iorder[1] = 1;
5141 iorder[2] = 2;
5142 iorder[3] = 3;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005143 encoding = "utf-32-le";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005144 }
5145 else if (byteorder == 1) {
5146 /* force BE */
5147 iorder[0] = 3;
5148 iorder[1] = 2;
5149 iorder[2] = 1;
5150 iorder[3] = 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005151 encoding = "utf-32-be";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005152 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005153 else
5154 encoding = "utf-32";
5155
5156 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005157 for (i = 0; i < len; i++)
5158 STORECHAR(PyUnicode_READ(kind, data, i));
5159 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005160 }
5161
Serhiy Storchaka30793282014-01-04 22:44:01 +02005162 for (i = 0; i < len;) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005163 Py_ssize_t repsize, moreunits;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005164 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5165 i++;
5166 assert(ch <= MAX_UNICODE);
5167 if (!Py_UNICODE_IS_SURROGATE(ch)) {
5168 STORECHAR(ch);
5169 continue;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005170 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005171
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005172 rep = unicode_encode_call_errorhandler(
5173 errors, &errorHandler,
5174 encoding, "surrogates not allowed",
Serhiy Storchaka30793282014-01-04 22:44:01 +02005175 str, &exc, i-1, i, &i);
5176
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005177 if (!rep)
5178 goto error;
5179
5180 if (PyBytes_Check(rep)) {
5181 repsize = PyBytes_GET_SIZE(rep);
5182 if (repsize & 3) {
5183 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005184 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005185 "surrogates not allowed");
5186 goto error;
5187 }
5188 moreunits = repsize / 4;
5189 }
5190 else {
5191 assert(PyUnicode_Check(rep));
5192 if (PyUnicode_READY(rep) < 0)
5193 goto error;
5194 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5195 if (!PyUnicode_IS_ASCII(rep)) {
5196 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005197 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005198 "surrogates not allowed");
5199 goto error;
5200 }
5201 }
5202
5203 /* four bytes are reserved for each surrogate */
5204 if (moreunits > 1) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005205 Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005206 Py_ssize_t morebytes = 4 * (moreunits - 1);
5207 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5208 /* integer overflow */
5209 PyErr_NoMemory();
5210 goto error;
5211 }
5212 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5213 goto error;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005214 p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005215 }
5216
5217 if (PyBytes_Check(rep)) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005218 Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
5219 p += repsize;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005220 } else /* rep is unicode */ {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005221 const Py_UCS1 *repdata;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005222 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005223 repdata = PyUnicode_1BYTE_DATA(rep);
5224 while (repsize--) {
5225 Py_UCS4 ch = *repdata++;
5226 STORECHAR(ch);
5227 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005228 }
5229
5230 Py_CLEAR(rep);
5231 }
5232
5233 /* Cut back to size actually needed. This is necessary for, for example,
5234 encoding of a string containing isolated surrogates and the 'ignore'
5235 handler is used. */
Serhiy Storchaka30793282014-01-04 22:44:01 +02005236 nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005237 if (nsize != PyBytes_GET_SIZE(v))
5238 _PyBytes_Resize(&v, nsize);
5239 Py_XDECREF(errorHandler);
5240 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005241 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005242 error:
5243 Py_XDECREF(rep);
5244 Py_XDECREF(errorHandler);
5245 Py_XDECREF(exc);
5246 Py_XDECREF(v);
5247 return NULL;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005248#undef STORECHAR
Walter Dörwald41980ca2007-08-16 21:55:45 +00005249}
5250
Alexander Belopolsky40018472011-02-26 01:02:56 +00005251PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005252PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5253 Py_ssize_t size,
5254 const char *errors,
5255 int byteorder)
5256{
5257 PyObject *result;
5258 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5259 if (tmp == NULL)
5260 return NULL;
5261 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5262 Py_DECREF(tmp);
5263 return result;
5264}
5265
5266PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005267PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005268{
Victor Stinnerb960b342011-11-20 19:12:52 +01005269 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005270}
5271
/* --- UTF-16 Codec ------------------------------------------------------- */
5273
Tim Peters772747b2001-08-09 22:21:55 +00005274PyObject *
5275PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005276 Py_ssize_t size,
5277 const char *errors,
5278 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279{
Walter Dörwald69652032004-09-07 20:24:22 +00005280 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5281}
5282
5283PyObject *
5284PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005285 Py_ssize_t size,
5286 const char *errors,
5287 int *byteorder,
5288 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005289{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005290 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005291 Py_ssize_t startinpos;
5292 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005293 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005294 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005295 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005296 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005297 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005298 PyObject *errorHandler = NULL;
5299 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005300 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301
Tim Peters772747b2001-08-09 22:21:55 +00005302 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005303 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304
5305 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005306 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005308 /* Check for BOM marks (U+FEFF) in the input and adjust current
5309 byte order setting accordingly. In native mode, the leading BOM
5310 mark is skipped, in all other modes, it is copied to the output
5311 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005312 if (bo == 0 && size >= 2) {
5313 const Py_UCS4 bom = (q[1] << 8) | q[0];
5314 if (bom == 0xFEFF) {
5315 q += 2;
5316 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005317 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005318 else if (bom == 0xFFFE) {
5319 q += 2;
5320 bo = 1;
5321 }
5322 if (byteorder)
5323 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005324 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325
Antoine Pitrou63065d72012-05-15 23:48:04 +02005326 if (q == e) {
5327 if (consumed)
5328 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005329 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005330 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005331
Christian Heimes743e0cd2012-10-17 23:52:17 +02005332#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005333 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005334 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005335#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005336 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005337 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005338#endif
Tim Peters772747b2001-08-09 22:21:55 +00005339
Antoine Pitrou63065d72012-05-15 23:48:04 +02005340 /* Note: size will always be longer than the resulting Unicode
5341 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005342 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005343 writer.min_length = (e - q + 1) / 2;
5344 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005345 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005346
Antoine Pitrou63065d72012-05-15 23:48:04 +02005347 while (1) {
5348 Py_UCS4 ch = 0;
5349 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005350 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005351 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005352 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005353 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005354 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005355 native_ordering);
5356 else
5357 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005358 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005359 native_ordering);
5360 } else if (kind == PyUnicode_2BYTE_KIND) {
5361 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005362 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005363 native_ordering);
5364 } else {
5365 assert(kind == PyUnicode_4BYTE_KIND);
5366 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005367 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005368 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005369 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005370 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005371
Antoine Pitrou63065d72012-05-15 23:48:04 +02005372 switch (ch)
5373 {
5374 case 0:
5375 /* remaining byte at the end? (size should be even) */
5376 if (q == e || consumed)
5377 goto End;
5378 errmsg = "truncated data";
5379 startinpos = ((const char *)q) - starts;
5380 endinpos = ((const char *)e) - starts;
5381 break;
5382 /* The remaining input chars are ignored if the callback
5383 chooses to skip the input */
5384 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005385 q -= 2;
5386 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005387 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005388 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005389 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005390 endinpos = ((const char *)e) - starts;
5391 break;
5392 case 2:
5393 errmsg = "illegal encoding";
5394 startinpos = ((const char *)q) - 2 - starts;
5395 endinpos = startinpos + 2;
5396 break;
5397 case 3:
5398 errmsg = "illegal UTF-16 surrogate";
5399 startinpos = ((const char *)q) - 4 - starts;
5400 endinpos = startinpos + 2;
5401 break;
5402 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005403 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005404 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005405 continue;
5406 }
5407
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005408 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005409 errors,
5410 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005411 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005412 &starts,
5413 (const char **)&e,
5414 &startinpos,
5415 &endinpos,
5416 &exc,
5417 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005418 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005419 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 }
5421
Antoine Pitrou63065d72012-05-15 23:48:04 +02005422End:
Walter Dörwald69652032004-09-07 20:24:22 +00005423 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005424 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005425
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005426 Py_XDECREF(errorHandler);
5427 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005428 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429
Benjamin Peterson29060642009-01-31 22:14:21 +00005430 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005431 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005432 Py_XDECREF(errorHandler);
5433 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434 return NULL;
5435}
5436
Tim Peters772747b2001-08-09 22:21:55 +00005437PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005438_PyUnicode_EncodeUTF16(PyObject *str,
5439 const char *errors,
5440 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005442 enum PyUnicode_Kind kind;
5443 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005444 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005445 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005446 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005447 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005448#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005449 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005450#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005451 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005452#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005453 const char *encoding;
5454 Py_ssize_t nsize, pos;
5455 PyObject *errorHandler = NULL;
5456 PyObject *exc = NULL;
5457 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005458
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005459 if (!PyUnicode_Check(str)) {
5460 PyErr_BadArgument();
5461 return NULL;
5462 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005463 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005464 return NULL;
5465 kind = PyUnicode_KIND(str);
5466 data = PyUnicode_DATA(str);
5467 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005468
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005469 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005470 if (kind == PyUnicode_4BYTE_KIND) {
5471 const Py_UCS4 *in = (const Py_UCS4 *)data;
5472 const Py_UCS4 *end = in + len;
5473 while (in < end)
5474 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005475 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005476 }
5477 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005478 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005479 nsize = len + pairs + (byteorder == 0);
5480 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 if (v == NULL)
5482 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005484 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005485 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005486 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005488 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005489 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005490 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005491
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005492 if (kind == PyUnicode_1BYTE_KIND) {
5493 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5494 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005495 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005496
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005497 if (byteorder < 0)
5498 encoding = "utf-16-le";
5499 else if (byteorder > 0)
5500 encoding = "utf-16-be";
5501 else
5502 encoding = "utf-16";
5503
5504 pos = 0;
5505 while (pos < len) {
5506 Py_ssize_t repsize, moreunits;
5507
5508 if (kind == PyUnicode_2BYTE_KIND) {
5509 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5510 &out, native_ordering);
5511 }
5512 else {
5513 assert(kind == PyUnicode_4BYTE_KIND);
5514 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5515 &out, native_ordering);
5516 }
5517 if (pos == len)
5518 break;
5519
5520 rep = unicode_encode_call_errorhandler(
5521 errors, &errorHandler,
5522 encoding, "surrogates not allowed",
5523 str, &exc, pos, pos + 1, &pos);
5524 if (!rep)
5525 goto error;
5526
5527 if (PyBytes_Check(rep)) {
5528 repsize = PyBytes_GET_SIZE(rep);
5529 if (repsize & 1) {
5530 raise_encode_exception(&exc, encoding,
5531 str, pos - 1, pos,
5532 "surrogates not allowed");
5533 goto error;
5534 }
5535 moreunits = repsize / 2;
5536 }
5537 else {
5538 assert(PyUnicode_Check(rep));
5539 if (PyUnicode_READY(rep) < 0)
5540 goto error;
5541 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5542 if (!PyUnicode_IS_ASCII(rep)) {
5543 raise_encode_exception(&exc, encoding,
5544 str, pos - 1, pos,
5545 "surrogates not allowed");
5546 goto error;
5547 }
5548 }
5549
5550 /* two bytes are reserved for each surrogate */
5551 if (moreunits > 1) {
5552 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5553 Py_ssize_t morebytes = 2 * (moreunits - 1);
5554 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5555 /* integer overflow */
5556 PyErr_NoMemory();
5557 goto error;
5558 }
5559 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5560 goto error;
5561 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5562 }
5563
5564 if (PyBytes_Check(rep)) {
5565 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5566 out += moreunits;
5567 } else /* rep is unicode */ {
5568 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5569 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5570 &out, native_ordering);
5571 }
5572
5573 Py_CLEAR(rep);
5574 }
5575
5576 /* Cut back to size actually needed. This is necessary for, for example,
5577 encoding of a string containing isolated surrogates and the 'ignore' handler
5578 is used. */
5579 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5580 if (nsize != PyBytes_GET_SIZE(v))
5581 _PyBytes_Resize(&v, nsize);
5582 Py_XDECREF(errorHandler);
5583 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005584 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005585 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005586 error:
5587 Py_XDECREF(rep);
5588 Py_XDECREF(errorHandler);
5589 Py_XDECREF(exc);
5590 Py_XDECREF(v);
5591 return NULL;
5592#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593}
5594
Alexander Belopolsky40018472011-02-26 01:02:56 +00005595PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005596PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5597 Py_ssize_t size,
5598 const char *errors,
5599 int byteorder)
5600{
5601 PyObject *result;
5602 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5603 if (tmp == NULL)
5604 return NULL;
5605 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5606 Py_DECREF(tmp);
5607 return result;
5608}
5609
5610PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005611PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005613 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614}
5615
/* --- Unicode Escape Codec ----------------------------------------------- */
5617
/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
   if all the escapes in the string make it still a valid ASCII string.

   S points to SIZE bytes of escaped input.

   Returns -1 if SIZE is negative, if the input contains a non-ASCII
   byte, if a backslash is the last byte, or if an escape is found whose
   result is not guaranteed to be ASCII (octal digits, \x, \u, \U, \N).
   Otherwise returns the number of characters the decoded string will
   have, i.e. the length of the required buffer.
   */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before doing any pointer arithmetic with it: forming
       s + size for a negative size would be undefined behaviour. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* simple escapes produce exactly one ASCII character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
Fredrik Lundh06d12682001-01-24 07:59:11 +00005673static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005674
Alexander Belopolsky40018472011-02-26 01:02:56 +00005675PyObject *
5676PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005677 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005678 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005680 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005681 Py_ssize_t startinpos;
5682 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005683 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005685 char* message;
5686 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005687 PyObject *errorHandler = NULL;
5688 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005689 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005690
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005691 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005692 if (len == 0)
5693 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005694
5695 /* After length_of_escaped_ascii_string() there are two alternatives,
5696 either the string is pure ASCII with named escapes like \n, etc.
5697 and we determined it's exact size (common case)
5698 or it contains \x, \u, ... escape sequences. then we create a
5699 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005700 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005701 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005702 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005703 }
5704 else {
5705 /* Escaped strings will always be longer than the resulting
5706 Unicode string, so we start with size here and then reduce the
5707 length after conversion to the true value.
5708 (but if the error callback returns a long replacement string
5709 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005710 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005711 }
5712
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005714 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005716
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717 while (s < end) {
5718 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005719 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005720 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721
5722 /* Non-escape characters are interpreted as Unicode ordinals */
5723 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005724 x = (unsigned char)*s;
5725 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005726 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005727 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 continue;
5729 }
5730
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005731 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 /* \ - Escapes */
5733 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005734 c = *s++;
5735 if (s > end)
5736 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005737
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005738 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005741#define WRITECHAR(ch) \
5742 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005743 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005744 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005745 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005746
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005748 case '\\': WRITECHAR('\\'); break;
5749 case '\'': WRITECHAR('\''); break;
5750 case '\"': WRITECHAR('\"'); break;
5751 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005752 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005753 case 'f': WRITECHAR('\014'); break;
5754 case 't': WRITECHAR('\t'); break;
5755 case 'n': WRITECHAR('\n'); break;
5756 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005757 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005758 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005759 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005760 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 case '0': case '1': case '2': case '3':
5764 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005765 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005766 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005767 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005768 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005769 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005771 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772 break;
5773
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 /* hex escapes */
5775 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005777 digits = 2;
5778 message = "truncated \\xXX escape";
5779 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780
Benjamin Peterson29060642009-01-31 22:14:21 +00005781 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005783 digits = 4;
5784 message = "truncated \\uXXXX escape";
5785 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005788 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005789 digits = 8;
5790 message = "truncated \\UXXXXXXXX escape";
5791 hexescape:
5792 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005793 if (end - s < digits) {
5794 /* count only hex digits */
5795 for (; s < end; ++s) {
5796 c = (unsigned char)*s;
5797 if (!Py_ISXDIGIT(c))
5798 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005799 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005800 goto error;
5801 }
5802 for (; digits--; ++s) {
5803 c = (unsigned char)*s;
5804 if (!Py_ISXDIGIT(c))
5805 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005806 chr = (chr<<4) & ~0xF;
5807 if (c >= '0' && c <= '9')
5808 chr += c - '0';
5809 else if (c >= 'a' && c <= 'f')
5810 chr += 10 + c - 'a';
5811 else
5812 chr += 10 + c - 'A';
5813 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005814 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005815 /* _decoding_error will have already written into the
5816 target buffer. */
5817 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005818 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005819 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005820 message = "illegal Unicode character";
5821 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005822 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005823 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005824 break;
5825
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005827 case 'N':
5828 message = "malformed \\N character escape";
5829 if (ucnhash_CAPI == NULL) {
5830 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005831 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5832 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005833 if (ucnhash_CAPI == NULL)
5834 goto ucnhashError;
5835 }
5836 if (*s == '{') {
5837 const char *start = s+1;
5838 /* look for the closing brace */
5839 while (*s != '}' && s < end)
5840 s++;
5841 if (s > start && s < end && *s == '}') {
5842 /* found a name. look it up in the unicode database */
5843 message = "unknown Unicode character name";
5844 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005845 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005846 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005847 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005848 goto store;
5849 }
5850 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005851 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005852
5853 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005854 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005855 message = "\\ at end of string";
5856 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005857 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005858 }
5859 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005860 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005861 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005862 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005863 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005865 continue;
5866
5867 error:
5868 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005869 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005870 errors, &errorHandler,
5871 "unicodeescape", message,
5872 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005873 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005874 goto onError;
5875 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005877#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005878
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005879 Py_XDECREF(errorHandler);
5880 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005881 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005882
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005884 PyErr_SetString(
5885 PyExc_UnicodeError,
5886 "\\N escapes not supported (can't load unicodedata module)"
5887 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005888 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005889 Py_XDECREF(errorHandler);
5890 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005891 return NULL;
5892
Benjamin Peterson29060642009-01-31 22:14:21 +00005893 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005894 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005895 Py_XDECREF(errorHandler);
5896 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 return NULL;
5898}
5899
/* Return a bytes object containing the Unicode-Escape encoded version of
   the Unicode object.  No surrounding quotes are added. */
5906
Alexander Belopolsky40018472011-02-26 01:02:56 +00005907PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005908PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005910 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005911 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005913 int kind;
5914 void *data;
5915 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916
Ezio Melottie7f90372012-10-05 03:33:31 +03005917 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005918 escape.
5919
Ezio Melottie7f90372012-10-05 03:33:31 +03005920 For UCS1 strings it's '\xxx', 4 bytes per source character.
5921 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5922 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005923 */
5924
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005925 if (!PyUnicode_Check(unicode)) {
5926 PyErr_BadArgument();
5927 return NULL;
5928 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005929 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005930 return NULL;
5931 len = PyUnicode_GET_LENGTH(unicode);
5932 kind = PyUnicode_KIND(unicode);
5933 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005934 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005935 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5936 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5937 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5938 }
5939
5940 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005941 return PyBytes_FromStringAndSize(NULL, 0);
5942
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005943 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005944 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005945
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005946 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005947 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005948 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005949 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 if (repr == NULL)
5951 return NULL;
5952
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005953 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005955 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005956 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005957
Walter Dörwald79e913e2007-05-12 11:08:06 +00005958 /* Escape backslashes */
5959 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 *p++ = '\\';
5961 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005962 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005963 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005964
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005965 /* Map 21-bit characters to '\U00xxxxxx' */
5966 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005967 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005968 *p++ = '\\';
5969 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005970 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5971 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5972 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5973 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5974 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5975 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5976 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5977 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005978 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005979 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005980
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005982 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 *p++ = '\\';
5984 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005985 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5986 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5987 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5988 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005990
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005991 /* Map special whitespace to '\t', \n', '\r' */
5992 else if (ch == '\t') {
5993 *p++ = '\\';
5994 *p++ = 't';
5995 }
5996 else if (ch == '\n') {
5997 *p++ = '\\';
5998 *p++ = 'n';
5999 }
6000 else if (ch == '\r') {
6001 *p++ = '\\';
6002 *p++ = 'r';
6003 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006004
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006005 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006006 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006008 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006009 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6010 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006011 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006012
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 /* Copy everything else as-is */
6014 else
6015 *p++ = (char) ch;
6016 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006018 assert(p - PyBytes_AS_STRING(repr) > 0);
6019 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6020 return NULL;
6021 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022}
6023
Alexander Belopolsky40018472011-02-26 01:02:56 +00006024PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006025PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6026 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006028 PyObject *result;
6029 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6030 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006032 result = PyUnicode_AsUnicodeEscapeString(tmp);
6033 Py_DECREF(tmp);
6034 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035}
6036
6037/* --- Raw Unicode Escape Codec ------------------------------------------- */
6038
Alexander Belopolsky40018472011-02-26 01:02:56 +00006039PyObject *
6040PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006041 Py_ssize_t size,
6042 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006044 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006045 Py_ssize_t startinpos;
6046 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006047 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048 const char *end;
6049 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006050 PyObject *errorHandler = NULL;
6051 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006052
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006053 if (size == 0)
6054 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006055
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056 /* Escaped strings will always be longer than the resulting
6057 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006058 length after conversion to the true value. (But decoding error
6059 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006060 _PyUnicodeWriter_Init(&writer);
6061 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006062
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 end = s + size;
6064 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 unsigned char c;
6066 Py_UCS4 x;
6067 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006068 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069
Benjamin Peterson29060642009-01-31 22:14:21 +00006070 /* Non-escape characters are interpreted as Unicode ordinals */
6071 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006072 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006073 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006074 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006076 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 startinpos = s-starts;
6078
6079 /* \u-escapes are only interpreted iff the number of leading
6080 backslashes if odd */
6081 bs = s;
6082 for (;s < end;) {
6083 if (*s != '\\')
6084 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006085 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006086 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006087 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 }
6089 if (((s - bs) & 1) == 0 ||
6090 s >= end ||
6091 (*s != 'u' && *s != 'U')) {
6092 continue;
6093 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006094 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006095 count = *s=='u' ? 4 : 8;
6096 s++;
6097
6098 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 for (x = 0, i = 0; i < count; ++i, ++s) {
6100 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006101 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006102 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006103 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006104 errors, &errorHandler,
6105 "rawunicodeescape", "truncated \\uXXXX",
6106 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006107 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006108 goto onError;
6109 goto nextByte;
6110 }
6111 x = (x<<4) & ~0xF;
6112 if (c >= '0' && c <= '9')
6113 x += c - '0';
6114 else if (c >= 'a' && c <= 'f')
6115 x += 10 + c - 'a';
6116 else
6117 x += 10 + c - 'A';
6118 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006119 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006120 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006121 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006122 }
6123 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006124 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006125 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006126 errors, &errorHandler,
6127 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006129 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006131 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006132 nextByte:
6133 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006135 Py_XDECREF(errorHandler);
6136 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006137 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006138
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006140 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006141 Py_XDECREF(errorHandler);
6142 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 return NULL;
6144}
6145
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006146
Alexander Belopolsky40018472011-02-26 01:02:56 +00006147PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006148PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006150 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 char *p;
6152 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006153 Py_ssize_t expandsize, pos;
6154 int kind;
6155 void *data;
6156 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006158 if (!PyUnicode_Check(unicode)) {
6159 PyErr_BadArgument();
6160 return NULL;
6161 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006162 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006163 return NULL;
6164 kind = PyUnicode_KIND(unicode);
6165 data = PyUnicode_DATA(unicode);
6166 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006167 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6168 bytes, and 1 byte characters 4. */
6169 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006170
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006171 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006173
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006174 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175 if (repr == NULL)
6176 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006177 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006178 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006180 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006181 for (pos = 0; pos < len; pos++) {
6182 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006183 /* Map 32-bit characters to '\Uxxxxxxxx' */
6184 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006185 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006186 *p++ = '\\';
6187 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006188 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6189 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6190 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6191 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6192 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6193 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6194 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6195 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006196 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006198 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 *p++ = '\\';
6200 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006201 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6202 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6203 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6204 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 /* Copy everything else as-is */
6207 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208 *p++ = (char) ch;
6209 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006210
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006211 assert(p > q);
6212 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006213 return NULL;
6214 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215}
6216
Alexander Belopolsky40018472011-02-26 01:02:56 +00006217PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006218PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6219 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006221 PyObject *result;
6222 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6223 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006224 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006225 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6226 Py_DECREF(tmp);
6227 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228}
6229
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006230/* --- Unicode Internal Codec ------------------------------------------- */
6231
Alexander Belopolsky40018472011-02-26 01:02:56 +00006232PyObject *
6233_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006234 Py_ssize_t size,
6235 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006236{
6237 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006238 Py_ssize_t startinpos;
6239 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006240 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006241 const char *end;
6242 const char *reason;
6243 PyObject *errorHandler = NULL;
6244 PyObject *exc = NULL;
6245
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006246 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006247 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006248 1))
6249 return NULL;
6250
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006251 if (size == 0)
6252 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006253
Victor Stinner8f674cc2013-04-17 23:02:17 +02006254 _PyUnicodeWriter_Init(&writer);
6255 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6256 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006258 }
6259 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006260
Victor Stinner8f674cc2013-04-17 23:02:17 +02006261 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006262 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006263 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006264 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006265 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006266 endinpos = end-starts;
6267 reason = "truncated input";
6268 goto error;
6269 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006270 /* We copy the raw representation one byte at a time because the
6271 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006272 ((char *) &uch)[0] = s[0];
6273 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006274#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006275 ((char *) &uch)[2] = s[2];
6276 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006277#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006278 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006279#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006280 /* We have to sanity check the raw data, otherwise doom looms for
6281 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006282 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006283 endinpos = s - starts + Py_UNICODE_SIZE;
6284 reason = "illegal code point (> 0x10FFFF)";
6285 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006286 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006287#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006288 s += Py_UNICODE_SIZE;
6289#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006290 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006291 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006292 Py_UNICODE uch2;
6293 ((char *) &uch2)[0] = s[0];
6294 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006295 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006296 {
Victor Stinner551ac952011-11-29 22:58:13 +01006297 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006298 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006299 }
6300 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006301#endif
6302
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006303 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006304 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006305 continue;
6306
6307 error:
6308 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006309 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006310 errors, &errorHandler,
6311 "unicode_internal", reason,
6312 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006313 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006314 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006315 }
6316
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006317 Py_XDECREF(errorHandler);
6318 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006319 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006320
Benjamin Peterson29060642009-01-31 22:14:21 +00006321 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006322 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006323 Py_XDECREF(errorHandler);
6324 Py_XDECREF(exc);
6325 return NULL;
6326}
6327
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328/* --- Latin-1 Codec ------------------------------------------------------ */
6329
Alexander Belopolsky40018472011-02-26 01:02:56 +00006330PyObject *
6331PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006332 Py_ssize_t size,
6333 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006336 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337}
6338
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006339/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006340static void
6341make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006342 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006343 PyObject *unicode,
6344 Py_ssize_t startpos, Py_ssize_t endpos,
6345 const char *reason)
6346{
6347 if (*exceptionObject == NULL) {
6348 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006349 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006350 encoding, unicode, startpos, endpos, reason);
6351 }
6352 else {
6353 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6354 goto onError;
6355 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6356 goto onError;
6357 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6358 goto onError;
6359 return;
6360 onError:
6361 Py_DECREF(*exceptionObject);
6362 *exceptionObject = NULL;
6363 }
6364}
6365
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006366/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006367static void
6368raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006369 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006370 PyObject *unicode,
6371 Py_ssize_t startpos, Py_ssize_t endpos,
6372 const char *reason)
6373{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006374 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006375 encoding, unicode, startpos, endpos, reason);
6376 if (*exceptionObject != NULL)
6377 PyCodec_StrictErrors(*exceptionObject);
6378}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006379
6380/* error handling callback helper:
6381 build arguments, call the callback and check the arguments,
6382 put the result into newpos and return the replacement string, which
6383 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006384static PyObject *
6385unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006386 PyObject **errorHandler,
6387 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006388 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006389 Py_ssize_t startpos, Py_ssize_t endpos,
6390 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006391{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006392 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006393 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006394 PyObject *restuple;
6395 PyObject *resunicode;
6396
6397 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006400 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006401 }
6402
Benjamin Petersonbac79492012-01-14 13:34:47 -05006403 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006404 return NULL;
6405 len = PyUnicode_GET_LENGTH(unicode);
6406
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006407 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006408 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006409 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006410 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006411
6412 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006413 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006416 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006417 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 Py_DECREF(restuple);
6419 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006420 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006421 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006422 &resunicode, newpos)) {
6423 Py_DECREF(restuple);
6424 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006425 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006426 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6427 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6428 Py_DECREF(restuple);
6429 return NULL;
6430 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006431 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006432 *newpos = len + *newpos;
6433 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006434 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6435 Py_DECREF(restuple);
6436 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006437 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006438 Py_INCREF(resunicode);
6439 Py_DECREF(restuple);
6440 return resunicode;
6441}
6442
Alexander Belopolsky40018472011-02-26 01:02:56 +00006443static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006444unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006445 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006446 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006447{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006448 /* input state */
6449 Py_ssize_t pos=0, size;
6450 int kind;
6451 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006452 /* output object */
6453 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006454 /* pointer into the output */
6455 char *str;
6456 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006457 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006458 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6459 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006460 PyObject *errorHandler = NULL;
6461 PyObject *exc = NULL;
6462 /* the following variable is used for caching string comparisons
6463 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6464 int known_errorHandler = -1;
6465
Benjamin Petersonbac79492012-01-14 13:34:47 -05006466 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006467 return NULL;
6468 size = PyUnicode_GET_LENGTH(unicode);
6469 kind = PyUnicode_KIND(unicode);
6470 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006471 /* allocate enough for a simple encoding without
6472 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006473 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006474 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006475 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006476 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006477 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006478 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006479 ressize = size;
6480
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006481 while (pos < size) {
6482 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483
Benjamin Peterson29060642009-01-31 22:14:21 +00006484 /* can we encode this? */
6485 if (c<limit) {
6486 /* no overflow check, because we know that the space is enough */
6487 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006488 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006489 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006490 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006491 Py_ssize_t requiredsize;
6492 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006493 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006495 Py_ssize_t collstart = pos;
6496 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006497 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006498 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 ++collend;
6500 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6501 if (known_errorHandler==-1) {
6502 if ((errors==NULL) || (!strcmp(errors, "strict")))
6503 known_errorHandler = 1;
6504 else if (!strcmp(errors, "replace"))
6505 known_errorHandler = 2;
6506 else if (!strcmp(errors, "ignore"))
6507 known_errorHandler = 3;
6508 else if (!strcmp(errors, "xmlcharrefreplace"))
6509 known_errorHandler = 4;
6510 else
6511 known_errorHandler = 0;
6512 }
6513 switch (known_errorHandler) {
6514 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006515 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006516 goto onError;
6517 case 2: /* replace */
6518 while (collstart++<collend)
6519 *str++ = '?'; /* fall through */
6520 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006521 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006522 break;
6523 case 4: /* xmlcharrefreplace */
6524 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006525 /* determine replacement size */
6526 for (i = collstart, repsize = 0; i < collend; ++i) {
6527 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6528 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006530 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006531 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006532 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006533 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006534 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006536 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006537 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006538 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006539 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006540 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006541 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006543 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006545 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006546 if (requiredsize > ressize) {
6547 if (requiredsize<2*ressize)
6548 requiredsize = 2*ressize;
6549 if (_PyBytes_Resize(&res, requiredsize))
6550 goto onError;
6551 str = PyBytes_AS_STRING(res) + respos;
6552 ressize = requiredsize;
6553 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006554 /* generate replacement */
6555 for (i = collstart; i < collend; ++i) {
6556 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006557 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006558 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 break;
6560 default:
6561 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006562 encoding, reason, unicode, &exc,
6563 collstart, collend, &newpos);
6564 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006565 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006567 if (PyBytes_Check(repunicode)) {
6568 /* Directly copy bytes result to output. */
6569 repsize = PyBytes_Size(repunicode);
6570 if (repsize > 1) {
6571 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006572 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006573 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6574 Py_DECREF(repunicode);
6575 goto onError;
6576 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006577 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006578 ressize += repsize-1;
6579 }
6580 memcpy(str, PyBytes_AsString(repunicode), repsize);
6581 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006582 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006583 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006584 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006585 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006586 /* need more space? (at least enough for what we
6587 have+the replacement+the rest of the string, so
6588 we won't have to check space for encodable characters) */
6589 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006590 repsize = PyUnicode_GET_LENGTH(repunicode);
6591 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006592 if (requiredsize > ressize) {
6593 if (requiredsize<2*ressize)
6594 requiredsize = 2*ressize;
6595 if (_PyBytes_Resize(&res, requiredsize)) {
6596 Py_DECREF(repunicode);
6597 goto onError;
6598 }
6599 str = PyBytes_AS_STRING(res) + respos;
6600 ressize = requiredsize;
6601 }
6602 /* check if there is anything unencodable in the replacement
6603 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006604 for (i = 0; repsize-->0; ++i, ++str) {
6605 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006606 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006607 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006608 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006609 Py_DECREF(repunicode);
6610 goto onError;
6611 }
6612 *str = (char)c;
6613 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006614 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006615 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006616 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006617 }
6618 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006619 /* Resize if we allocated to much */
6620 size = str - PyBytes_AS_STRING(res);
6621 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006622 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006623 if (_PyBytes_Resize(&res, size) < 0)
6624 goto onError;
6625 }
6626
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006627 Py_XDECREF(errorHandler);
6628 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006629 return res;
6630
6631 onError:
6632 Py_XDECREF(res);
6633 Py_XDECREF(errorHandler);
6634 Py_XDECREF(exc);
6635 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006636}
6637
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006638/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006639PyObject *
6640PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006641 Py_ssize_t size,
6642 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006644 PyObject *result;
6645 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6646 if (unicode == NULL)
6647 return NULL;
6648 result = unicode_encode_ucs1(unicode, errors, 256);
6649 Py_DECREF(unicode);
6650 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651}
6652
Alexander Belopolsky40018472011-02-26 01:02:56 +00006653PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006654_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655{
6656 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006657 PyErr_BadArgument();
6658 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006660 if (PyUnicode_READY(unicode) == -1)
6661 return NULL;
6662 /* Fast path: if it is a one-byte string, construct
6663 bytes object directly. */
6664 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6665 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6666 PyUnicode_GET_LENGTH(unicode));
6667 /* Non-Latin-1 characters present. Defer to above function to
6668 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006669 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006670}
6671
6672PyObject*
6673PyUnicode_AsLatin1String(PyObject *unicode)
6674{
6675 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676}
6677
6678/* --- 7-bit ASCII Codec -------------------------------------------------- */
6679
Alexander Belopolsky40018472011-02-26 01:02:56 +00006680PyObject *
6681PyUnicode_DecodeASCII(const char *s,
6682 Py_ssize_t size,
6683 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006685 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006686 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006687 int kind;
6688 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006689 Py_ssize_t startinpos;
6690 Py_ssize_t endinpos;
6691 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006692 const char *e;
6693 PyObject *errorHandler = NULL;
6694 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006695
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006697 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006698
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006700 if (size == 1 && (unsigned char)s[0] < 128)
6701 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006702
Victor Stinner8f674cc2013-04-17 23:02:17 +02006703 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006704 writer.min_length = size;
6705 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006706 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006707
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006708 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006709 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006710 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006711 writer.pos = outpos;
6712 if (writer.pos == size)
6713 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006714
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006715 s += writer.pos;
6716 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006717 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006718 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006720 PyUnicode_WRITE(kind, data, writer.pos, c);
6721 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006722 ++s;
6723 }
6724 else {
6725 startinpos = s-starts;
6726 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006727 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006728 errors, &errorHandler,
6729 "ascii", "ordinal not in range(128)",
6730 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006731 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006732 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006733 kind = writer.kind;
6734 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006737 Py_XDECREF(errorHandler);
6738 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006739 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006740
Benjamin Peterson29060642009-01-31 22:14:21 +00006741 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006742 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006743 Py_XDECREF(errorHandler);
6744 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745 return NULL;
6746}
6747
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006748/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006749PyObject *
6750PyUnicode_EncodeASCII(const Py_UNICODE *p,
6751 Py_ssize_t size,
6752 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006754 PyObject *result;
6755 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6756 if (unicode == NULL)
6757 return NULL;
6758 result = unicode_encode_ucs1(unicode, errors, 128);
6759 Py_DECREF(unicode);
6760 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761}
6762
Alexander Belopolsky40018472011-02-26 01:02:56 +00006763PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006764_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765{
6766 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 PyErr_BadArgument();
6768 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006770 if (PyUnicode_READY(unicode) == -1)
6771 return NULL;
6772 /* Fast path: if it is an ASCII-only string, construct bytes object
6773 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006774 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006775 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6776 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006777 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006778}
6779
6780PyObject *
6781PyUnicode_AsASCIIString(PyObject *unicode)
6782{
6783 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784}
6785
Victor Stinner99b95382011-07-04 14:23:54 +02006786#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006787
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006788/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006789
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006790#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006791#define NEED_RETRY
6792#endif
6793
Victor Stinner3a50e702011-10-18 21:21:00 +02006794#ifndef WC_ERR_INVALID_CHARS
6795# define WC_ERR_INVALID_CHARS 0x0080
6796#endif
6797
6798static char*
6799code_page_name(UINT code_page, PyObject **obj)
6800{
6801 *obj = NULL;
6802 if (code_page == CP_ACP)
6803 return "mbcs";
6804 if (code_page == CP_UTF7)
6805 return "CP_UTF7";
6806 if (code_page == CP_UTF8)
6807 return "CP_UTF8";
6808
6809 *obj = PyBytes_FromFormat("cp%u", code_page);
6810 if (*obj == NULL)
6811 return NULL;
6812 return PyBytes_AS_STRING(*obj);
6813}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006814
Alexander Belopolsky40018472011-02-26 01:02:56 +00006815static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006816is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006817{
6818 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006819 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006820
Victor Stinner3a50e702011-10-18 21:21:00 +02006821 if (!IsDBCSLeadByteEx(code_page, *curr))
6822 return 0;
6823
6824 prev = CharPrevExA(code_page, s, curr, 0);
6825 if (prev == curr)
6826 return 1;
6827 /* FIXME: This code is limited to "true" double-byte encodings,
6828 as it assumes an incomplete character consists of a single
6829 byte. */
6830 if (curr - prev == 2)
6831 return 1;
6832 if (!IsDBCSLeadByteEx(code_page, *prev))
6833 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006834 return 0;
6835}
6836
Victor Stinner3a50e702011-10-18 21:21:00 +02006837static DWORD
6838decode_code_page_flags(UINT code_page)
6839{
6840 if (code_page == CP_UTF7) {
6841 /* The CP_UTF7 decoder only supports flags=0 */
6842 return 0;
6843 }
6844 else
6845 return MB_ERR_INVALID_CHARS;
6846}
6847
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006848/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006849 * Decode a byte string from a Windows code page into unicode object in strict
6850 * mode.
6851 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006852 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6853 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006854 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006855static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006856decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006857 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006858 const char *in,
6859 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006860{
Victor Stinner3a50e702011-10-18 21:21:00 +02006861 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006862 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006863 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006864
6865 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006866 assert(insize > 0);
6867 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6868 if (outsize <= 0)
6869 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006870
6871 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006873 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006874 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006875 if (*v == NULL)
6876 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006877 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006878 }
6879 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006880 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006881 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006882 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006884 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006885 }
6886
6887 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006888 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6889 if (outsize <= 0)
6890 goto error;
6891 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006892
Victor Stinner3a50e702011-10-18 21:21:00 +02006893error:
6894 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6895 return -2;
6896 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006897 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006898}
6899
Victor Stinner3a50e702011-10-18 21:21:00 +02006900/*
6901 * Decode a byte string from a code page into unicode object with an error
6902 * handler.
6903 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006904 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006905 * UnicodeDecodeError exception and returns -1 on error.
6906 */
6907static int
6908decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006909 PyObject **v,
6910 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006911 const char *errors)
6912{
6913 const char *startin = in;
6914 const char *endin = in + size;
6915 const DWORD flags = decode_code_page_flags(code_page);
6916 /* Ideally, we should get reason from FormatMessage. This is the Windows
6917 2000 English version of the message. */
6918 const char *reason = "No mapping for the Unicode character exists "
6919 "in the target code page.";
6920 /* each step cannot decode more than 1 character, but a character can be
6921 represented as a surrogate pair */
6922 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006923 int insize;
6924 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006925 PyObject *errorHandler = NULL;
6926 PyObject *exc = NULL;
6927 PyObject *encoding_obj = NULL;
6928 char *encoding;
6929 DWORD err;
6930 int ret = -1;
6931
6932 assert(size > 0);
6933
6934 encoding = code_page_name(code_page, &encoding_obj);
6935 if (encoding == NULL)
6936 return -1;
6937
6938 if (errors == NULL || strcmp(errors, "strict") == 0) {
6939 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6940 UnicodeDecodeError. */
6941 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6942 if (exc != NULL) {
6943 PyCodec_StrictErrors(exc);
6944 Py_CLEAR(exc);
6945 }
6946 goto error;
6947 }
6948
6949 if (*v == NULL) {
6950 /* Create unicode object */
6951 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6952 PyErr_NoMemory();
6953 goto error;
6954 }
Victor Stinnerab595942011-12-17 04:59:06 +01006955 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006956 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006957 if (*v == NULL)
6958 goto error;
6959 startout = PyUnicode_AS_UNICODE(*v);
6960 }
6961 else {
6962 /* Extend unicode object */
6963 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6964 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6965 PyErr_NoMemory();
6966 goto error;
6967 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006968 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006969 goto error;
6970 startout = PyUnicode_AS_UNICODE(*v) + n;
6971 }
6972
6973 /* Decode the byte string character per character */
6974 out = startout;
6975 while (in < endin)
6976 {
6977 /* Decode a character */
6978 insize = 1;
6979 do
6980 {
6981 outsize = MultiByteToWideChar(code_page, flags,
6982 in, insize,
6983 buffer, Py_ARRAY_LENGTH(buffer));
6984 if (outsize > 0)
6985 break;
6986 err = GetLastError();
6987 if (err != ERROR_NO_UNICODE_TRANSLATION
6988 && err != ERROR_INSUFFICIENT_BUFFER)
6989 {
6990 PyErr_SetFromWindowsErr(0);
6991 goto error;
6992 }
6993 insize++;
6994 }
6995 /* 4=maximum length of a UTF-8 sequence */
6996 while (insize <= 4 && (in + insize) <= endin);
6997
6998 if (outsize <= 0) {
6999 Py_ssize_t startinpos, endinpos, outpos;
7000
7001 startinpos = in - startin;
7002 endinpos = startinpos + 1;
7003 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007004 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007005 errors, &errorHandler,
7006 encoding, reason,
7007 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007008 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007009 {
7010 goto error;
7011 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007012 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007013 }
7014 else {
7015 in += insize;
7016 memcpy(out, buffer, outsize * sizeof(wchar_t));
7017 out += outsize;
7018 }
7019 }
7020
7021 /* write a NUL character at the end */
7022 *out = 0;
7023
7024 /* Extend unicode object */
7025 outsize = out - startout;
7026 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007027 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007028 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007029 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007030
7031error:
7032 Py_XDECREF(encoding_obj);
7033 Py_XDECREF(errorHandler);
7034 Py_XDECREF(exc);
7035 return ret;
7036}
7037
Victor Stinner3a50e702011-10-18 21:21:00 +02007038static PyObject *
7039decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007040 const char *s, Py_ssize_t size,
7041 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007042{
Victor Stinner76a31a62011-11-04 00:05:13 +01007043 PyObject *v = NULL;
7044 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007045
Victor Stinner3a50e702011-10-18 21:21:00 +02007046 if (code_page < 0) {
7047 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7048 return NULL;
7049 }
7050
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007051 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007052 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007053
Victor Stinner76a31a62011-11-04 00:05:13 +01007054 do
7055 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007056#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007057 if (size > INT_MAX) {
7058 chunk_size = INT_MAX;
7059 final = 0;
7060 done = 0;
7061 }
7062 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007063#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007064 {
7065 chunk_size = (int)size;
7066 final = (consumed == NULL);
7067 done = 1;
7068 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007069
Victor Stinner76a31a62011-11-04 00:05:13 +01007070 /* Skip trailing lead-byte unless 'final' is set */
7071 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7072 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007073
Victor Stinner76a31a62011-11-04 00:05:13 +01007074 if (chunk_size == 0 && done) {
7075 if (v != NULL)
7076 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007077 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007078 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007079
Victor Stinner76a31a62011-11-04 00:05:13 +01007080
7081 converted = decode_code_page_strict(code_page, &v,
7082 s, chunk_size);
7083 if (converted == -2)
7084 converted = decode_code_page_errors(code_page, &v,
7085 s, chunk_size,
7086 errors);
7087 assert(converted != 0);
7088
7089 if (converted < 0) {
7090 Py_XDECREF(v);
7091 return NULL;
7092 }
7093
7094 if (consumed)
7095 *consumed += converted;
7096
7097 s += converted;
7098 size -= converted;
7099 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007100
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007101 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007102}
7103
Alexander Belopolsky40018472011-02-26 01:02:56 +00007104PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007105PyUnicode_DecodeCodePageStateful(int code_page,
7106 const char *s,
7107 Py_ssize_t size,
7108 const char *errors,
7109 Py_ssize_t *consumed)
7110{
7111 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7112}
7113
7114PyObject *
7115PyUnicode_DecodeMBCSStateful(const char *s,
7116 Py_ssize_t size,
7117 const char *errors,
7118 Py_ssize_t *consumed)
7119{
7120 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7121}
7122
7123PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007124PyUnicode_DecodeMBCS(const char *s,
7125 Py_ssize_t size,
7126 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007127{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007128 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7129}
7130
Victor Stinner3a50e702011-10-18 21:21:00 +02007131static DWORD
7132encode_code_page_flags(UINT code_page, const char *errors)
7133{
7134 if (code_page == CP_UTF8) {
7135 if (winver.dwMajorVersion >= 6)
7136 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7137 and later */
7138 return WC_ERR_INVALID_CHARS;
7139 else
7140 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7141 return 0;
7142 }
7143 else if (code_page == CP_UTF7) {
7144 /* CP_UTF7 only supports flags=0 */
7145 return 0;
7146 }
7147 else {
7148 if (errors != NULL && strcmp(errors, "replace") == 0)
7149 return 0;
7150 else
7151 return WC_NO_BEST_FIT_CHARS;
7152 }
7153}
7154
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007155/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007156 * Encode a Unicode string to a Windows code page into a byte string in strict
7157 * mode.
7158 *
7159 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007160 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007161 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007162static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007163encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007164 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007165 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007166{
Victor Stinner554f3f02010-06-16 23:33:54 +00007167 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 BOOL *pusedDefaultChar = &usedDefaultChar;
7169 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007170 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007171 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007172 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007173 const DWORD flags = encode_code_page_flags(code_page, NULL);
7174 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007175 /* Create a substring so that we can get the UTF-16 representation
7176 of just the slice under consideration. */
7177 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007178
Martin v. Löwis3d325192011-11-04 18:23:06 +01007179 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007180
Victor Stinner3a50e702011-10-18 21:21:00 +02007181 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007182 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007183 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007184 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007185
Victor Stinner2fc507f2011-11-04 20:06:39 +01007186 substring = PyUnicode_Substring(unicode, offset, offset+len);
7187 if (substring == NULL)
7188 return -1;
7189 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7190 if (p == NULL) {
7191 Py_DECREF(substring);
7192 return -1;
7193 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007194 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007195
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007196 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007197 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007198 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007199 NULL, 0,
7200 NULL, pusedDefaultChar);
7201 if (outsize <= 0)
7202 goto error;
7203 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007204 if (pusedDefaultChar && *pusedDefaultChar) {
7205 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007206 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007207 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007208
Victor Stinner3a50e702011-10-18 21:21:00 +02007209 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007210 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007211 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007212 if (*outbytes == NULL) {
7213 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007214 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007215 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007217 }
7218 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007219 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007220 const Py_ssize_t n = PyBytes_Size(*outbytes);
7221 if (outsize > PY_SSIZE_T_MAX - n) {
7222 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007223 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007224 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007225 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007226 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7227 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007228 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007229 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007230 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007231 }
7232
7233 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007234 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007235 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007236 out, outsize,
7237 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007238 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007239 if (outsize <= 0)
7240 goto error;
7241 if (pusedDefaultChar && *pusedDefaultChar)
7242 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007243 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007244
Victor Stinner3a50e702011-10-18 21:21:00 +02007245error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007246 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007247 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7248 return -2;
7249 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007250 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007251}
7252
Victor Stinner3a50e702011-10-18 21:21:00 +02007253/*
7254 * Encode a Unicode string to a Windows code page into a byte string using a
7255 * error handler.
7256 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007257 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007258 * -1 on other error.
7259 */
7260static int
7261encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007262 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007263 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007264{
Victor Stinner3a50e702011-10-18 21:21:00 +02007265 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007266 Py_ssize_t pos = unicode_offset;
7267 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007268 /* Ideally, we should get reason from FormatMessage. This is the Windows
7269 2000 English version of the message. */
7270 const char *reason = "invalid character";
7271 /* 4=maximum length of a UTF-8 sequence */
7272 char buffer[4];
7273 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7274 Py_ssize_t outsize;
7275 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007276 PyObject *errorHandler = NULL;
7277 PyObject *exc = NULL;
7278 PyObject *encoding_obj = NULL;
7279 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007280 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007281 PyObject *rep;
7282 int ret = -1;
7283
7284 assert(insize > 0);
7285
7286 encoding = code_page_name(code_page, &encoding_obj);
7287 if (encoding == NULL)
7288 return -1;
7289
7290 if (errors == NULL || strcmp(errors, "strict") == 0) {
7291 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7292 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007293 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007294 if (exc != NULL) {
7295 PyCodec_StrictErrors(exc);
7296 Py_DECREF(exc);
7297 }
7298 Py_XDECREF(encoding_obj);
7299 return -1;
7300 }
7301
7302 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7303 pusedDefaultChar = &usedDefaultChar;
7304 else
7305 pusedDefaultChar = NULL;
7306
7307 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7308 PyErr_NoMemory();
7309 goto error;
7310 }
7311 outsize = insize * Py_ARRAY_LENGTH(buffer);
7312
7313 if (*outbytes == NULL) {
7314 /* Create string object */
7315 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7316 if (*outbytes == NULL)
7317 goto error;
7318 out = PyBytes_AS_STRING(*outbytes);
7319 }
7320 else {
7321 /* Extend string object */
7322 Py_ssize_t n = PyBytes_Size(*outbytes);
7323 if (n > PY_SSIZE_T_MAX - outsize) {
7324 PyErr_NoMemory();
7325 goto error;
7326 }
7327 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7328 goto error;
7329 out = PyBytes_AS_STRING(*outbytes) + n;
7330 }
7331
7332 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007333 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007334 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007335 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7336 wchar_t chars[2];
7337 int charsize;
7338 if (ch < 0x10000) {
7339 chars[0] = (wchar_t)ch;
7340 charsize = 1;
7341 }
7342 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007343 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7344 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007345 charsize = 2;
7346 }
7347
Victor Stinner3a50e702011-10-18 21:21:00 +02007348 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007349 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007350 buffer, Py_ARRAY_LENGTH(buffer),
7351 NULL, pusedDefaultChar);
7352 if (outsize > 0) {
7353 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7354 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007355 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007356 memcpy(out, buffer, outsize);
7357 out += outsize;
7358 continue;
7359 }
7360 }
7361 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7362 PyErr_SetFromWindowsErr(0);
7363 goto error;
7364 }
7365
Victor Stinner3a50e702011-10-18 21:21:00 +02007366 rep = unicode_encode_call_errorhandler(
7367 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007368 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007369 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007370 if (rep == NULL)
7371 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007372 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007373
7374 if (PyBytes_Check(rep)) {
7375 outsize = PyBytes_GET_SIZE(rep);
7376 if (outsize != 1) {
7377 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7378 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7379 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7380 Py_DECREF(rep);
7381 goto error;
7382 }
7383 out = PyBytes_AS_STRING(*outbytes) + offset;
7384 }
7385 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7386 out += outsize;
7387 }
7388 else {
7389 Py_ssize_t i;
7390 enum PyUnicode_Kind kind;
7391 void *data;
7392
Benjamin Petersonbac79492012-01-14 13:34:47 -05007393 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007394 Py_DECREF(rep);
7395 goto error;
7396 }
7397
7398 outsize = PyUnicode_GET_LENGTH(rep);
7399 if (outsize != 1) {
7400 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7401 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7402 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7403 Py_DECREF(rep);
7404 goto error;
7405 }
7406 out = PyBytes_AS_STRING(*outbytes) + offset;
7407 }
7408 kind = PyUnicode_KIND(rep);
7409 data = PyUnicode_DATA(rep);
7410 for (i=0; i < outsize; i++) {
7411 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7412 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007413 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007414 encoding, unicode,
7415 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007416 "unable to encode error handler result to ASCII");
7417 Py_DECREF(rep);
7418 goto error;
7419 }
7420 *out = (unsigned char)ch;
7421 out++;
7422 }
7423 }
7424 Py_DECREF(rep);
7425 }
7426 /* write a NUL byte */
7427 *out = 0;
7428 outsize = out - PyBytes_AS_STRING(*outbytes);
7429 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7430 if (_PyBytes_Resize(outbytes, outsize) < 0)
7431 goto error;
7432 ret = 0;
7433
7434error:
7435 Py_XDECREF(encoding_obj);
7436 Py_XDECREF(errorHandler);
7437 Py_XDECREF(exc);
7438 return ret;
7439}
7440
Victor Stinner3a50e702011-10-18 21:21:00 +02007441static PyObject *
7442encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007443 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007444 const char *errors)
7445{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007446 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007447 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007448 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007449 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007450
Benjamin Petersonbac79492012-01-14 13:34:47 -05007451 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007452 return NULL;
7453 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007454
Victor Stinner3a50e702011-10-18 21:21:00 +02007455 if (code_page < 0) {
7456 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7457 return NULL;
7458 }
7459
Martin v. Löwis3d325192011-11-04 18:23:06 +01007460 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007461 return PyBytes_FromStringAndSize(NULL, 0);
7462
Victor Stinner7581cef2011-11-03 22:32:33 +01007463 offset = 0;
7464 do
7465 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007466#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007467 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007468 chunks. */
7469 if (len > INT_MAX/2) {
7470 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007471 done = 0;
7472 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007473 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007474#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007475 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007476 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007477 done = 1;
7478 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007479
Victor Stinner76a31a62011-11-04 00:05:13 +01007480 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007481 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007482 errors);
7483 if (ret == -2)
7484 ret = encode_code_page_errors(code_page, &outbytes,
7485 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007486 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007487 if (ret < 0) {
7488 Py_XDECREF(outbytes);
7489 return NULL;
7490 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007491
Victor Stinner7581cef2011-11-03 22:32:33 +01007492 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007493 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007494 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007495
Victor Stinner3a50e702011-10-18 21:21:00 +02007496 return outbytes;
7497}
7498
7499PyObject *
7500PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7501 Py_ssize_t size,
7502 const char *errors)
7503{
Victor Stinner7581cef2011-11-03 22:32:33 +01007504 PyObject *unicode, *res;
7505 unicode = PyUnicode_FromUnicode(p, size);
7506 if (unicode == NULL)
7507 return NULL;
7508 res = encode_code_page(CP_ACP, unicode, errors);
7509 Py_DECREF(unicode);
7510 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007511}
7512
7513PyObject *
7514PyUnicode_EncodeCodePage(int code_page,
7515 PyObject *unicode,
7516 const char *errors)
7517{
Victor Stinner7581cef2011-11-03 22:32:33 +01007518 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007519}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007520
Alexander Belopolsky40018472011-02-26 01:02:56 +00007521PyObject *
7522PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007523{
7524 if (!PyUnicode_Check(unicode)) {
7525 PyErr_BadArgument();
7526 return NULL;
7527 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007528 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007529}
7530
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007531#undef NEED_RETRY
7532
Victor Stinner99b95382011-07-04 14:23:54 +02007533#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007534
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535/* --- Character Mapping Codec -------------------------------------------- */
7536
Victor Stinnerfb161b12013-04-18 01:44:27 +02007537static int
7538charmap_decode_string(const char *s,
7539 Py_ssize_t size,
7540 PyObject *mapping,
7541 const char *errors,
7542 _PyUnicodeWriter *writer)
7543{
7544 const char *starts = s;
7545 const char *e;
7546 Py_ssize_t startinpos, endinpos;
7547 PyObject *errorHandler = NULL, *exc = NULL;
7548 Py_ssize_t maplen;
7549 enum PyUnicode_Kind mapkind;
7550 void *mapdata;
7551 Py_UCS4 x;
7552 unsigned char ch;
7553
7554 if (PyUnicode_READY(mapping) == -1)
7555 return -1;
7556
7557 maplen = PyUnicode_GET_LENGTH(mapping);
7558 mapdata = PyUnicode_DATA(mapping);
7559 mapkind = PyUnicode_KIND(mapping);
7560
7561 e = s + size;
7562
7563 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7564 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7565 * is disabled in encoding aliases, latin1 is preferred because
7566 * its implementation is faster. */
7567 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7568 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7569 Py_UCS4 maxchar = writer->maxchar;
7570
7571 assert (writer->kind == PyUnicode_1BYTE_KIND);
7572 while (s < e) {
7573 ch = *s;
7574 x = mapdata_ucs1[ch];
7575 if (x > maxchar) {
7576 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7577 goto onError;
7578 maxchar = writer->maxchar;
7579 outdata = (Py_UCS1 *)writer->data;
7580 }
7581 outdata[writer->pos] = x;
7582 writer->pos++;
7583 ++s;
7584 }
7585 return 0;
7586 }
7587
7588 while (s < e) {
7589 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7590 enum PyUnicode_Kind outkind = writer->kind;
7591 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7592 if (outkind == PyUnicode_1BYTE_KIND) {
7593 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7594 Py_UCS4 maxchar = writer->maxchar;
7595 while (s < e) {
7596 ch = *s;
7597 x = mapdata_ucs2[ch];
7598 if (x > maxchar)
7599 goto Error;
7600 outdata[writer->pos] = x;
7601 writer->pos++;
7602 ++s;
7603 }
7604 break;
7605 }
7606 else if (outkind == PyUnicode_2BYTE_KIND) {
7607 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7608 while (s < e) {
7609 ch = *s;
7610 x = mapdata_ucs2[ch];
7611 if (x == 0xFFFE)
7612 goto Error;
7613 outdata[writer->pos] = x;
7614 writer->pos++;
7615 ++s;
7616 }
7617 break;
7618 }
7619 }
7620 ch = *s;
7621
7622 if (ch < maplen)
7623 x = PyUnicode_READ(mapkind, mapdata, ch);
7624 else
7625 x = 0xfffe; /* invalid value */
7626Error:
7627 if (x == 0xfffe)
7628 {
7629 /* undefined mapping */
7630 startinpos = s-starts;
7631 endinpos = startinpos+1;
7632 if (unicode_decode_call_errorhandler_writer(
7633 errors, &errorHandler,
7634 "charmap", "character maps to <undefined>",
7635 &starts, &e, &startinpos, &endinpos, &exc, &s,
7636 writer)) {
7637 goto onError;
7638 }
7639 continue;
7640 }
7641
7642 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7643 goto onError;
7644 ++s;
7645 }
7646 Py_XDECREF(errorHandler);
7647 Py_XDECREF(exc);
7648 return 0;
7649
7650onError:
7651 Py_XDECREF(errorHandler);
7652 Py_XDECREF(exc);
7653 return -1;
7654}
7655
7656static int
7657charmap_decode_mapping(const char *s,
7658 Py_ssize_t size,
7659 PyObject *mapping,
7660 const char *errors,
7661 _PyUnicodeWriter *writer)
7662{
7663 const char *starts = s;
7664 const char *e;
7665 Py_ssize_t startinpos, endinpos;
7666 PyObject *errorHandler = NULL, *exc = NULL;
7667 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007668 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007669
7670 e = s + size;
7671
7672 while (s < e) {
7673 ch = *s;
7674
7675 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7676 key = PyLong_FromLong((long)ch);
7677 if (key == NULL)
7678 goto onError;
7679
7680 item = PyObject_GetItem(mapping, key);
7681 Py_DECREF(key);
7682 if (item == NULL) {
7683 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7684 /* No mapping found means: mapping is undefined. */
7685 PyErr_Clear();
7686 goto Undefined;
7687 } else
7688 goto onError;
7689 }
7690
7691 /* Apply mapping */
7692 if (item == Py_None)
7693 goto Undefined;
7694 if (PyLong_Check(item)) {
7695 long value = PyLong_AS_LONG(item);
7696 if (value == 0xFFFE)
7697 goto Undefined;
7698 if (value < 0 || value > MAX_UNICODE) {
7699 PyErr_Format(PyExc_TypeError,
7700 "character mapping must be in range(0x%lx)",
7701 (unsigned long)MAX_UNICODE + 1);
7702 goto onError;
7703 }
7704
7705 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7706 goto onError;
7707 }
7708 else if (PyUnicode_Check(item)) {
7709 if (PyUnicode_READY(item) == -1)
7710 goto onError;
7711 if (PyUnicode_GET_LENGTH(item) == 1) {
7712 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7713 if (value == 0xFFFE)
7714 goto Undefined;
7715 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7716 goto onError;
7717 }
7718 else {
7719 writer->overallocate = 1;
7720 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7721 goto onError;
7722 }
7723 }
7724 else {
7725 /* wrong return value */
7726 PyErr_SetString(PyExc_TypeError,
7727 "character mapping must return integer, None or str");
7728 goto onError;
7729 }
7730 Py_CLEAR(item);
7731 ++s;
7732 continue;
7733
7734Undefined:
7735 /* undefined mapping */
7736 Py_CLEAR(item);
7737 startinpos = s-starts;
7738 endinpos = startinpos+1;
7739 if (unicode_decode_call_errorhandler_writer(
7740 errors, &errorHandler,
7741 "charmap", "character maps to <undefined>",
7742 &starts, &e, &startinpos, &endinpos, &exc, &s,
7743 writer)) {
7744 goto onError;
7745 }
7746 }
7747 Py_XDECREF(errorHandler);
7748 Py_XDECREF(exc);
7749 return 0;
7750
7751onError:
7752 Py_XDECREF(item);
7753 Py_XDECREF(errorHandler);
7754 Py_XDECREF(exc);
7755 return -1;
7756}
7757
Alexander Belopolsky40018472011-02-26 01:02:56 +00007758PyObject *
7759PyUnicode_DecodeCharmap(const char *s,
7760 Py_ssize_t size,
7761 PyObject *mapping,
7762 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007764 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007765
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766 /* Default to Latin-1 */
7767 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007768 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007771 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007772 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007773 writer.min_length = size;
7774 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007776
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007777 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007778 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7779 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007780 }
7781 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007782 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7783 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007785 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007786
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007788 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789 return NULL;
7790}
7791
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007792/* Charmap encoding: the lookup table */
7793
Alexander Belopolsky40018472011-02-26 01:02:56 +00007794struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007795 PyObject_HEAD
7796 unsigned char level1[32];
7797 int count2, count3;
7798 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007799};
7800
7801static PyObject*
7802encoding_map_size(PyObject *obj, PyObject* args)
7803{
7804 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007805 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007806 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007807}
7808
7809static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007810 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007811 PyDoc_STR("Return the size (in bytes) of this object") },
7812 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007813};
7814
7815static void
7816encoding_map_dealloc(PyObject* o)
7817{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007818 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007819}
7820
7821static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007822 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007823 "EncodingMap", /*tp_name*/
7824 sizeof(struct encoding_map), /*tp_basicsize*/
7825 0, /*tp_itemsize*/
7826 /* methods */
7827 encoding_map_dealloc, /*tp_dealloc*/
7828 0, /*tp_print*/
7829 0, /*tp_getattr*/
7830 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007831 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 0, /*tp_repr*/
7833 0, /*tp_as_number*/
7834 0, /*tp_as_sequence*/
7835 0, /*tp_as_mapping*/
7836 0, /*tp_hash*/
7837 0, /*tp_call*/
7838 0, /*tp_str*/
7839 0, /*tp_getattro*/
7840 0, /*tp_setattro*/
7841 0, /*tp_as_buffer*/
7842 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7843 0, /*tp_doc*/
7844 0, /*tp_traverse*/
7845 0, /*tp_clear*/
7846 0, /*tp_richcompare*/
7847 0, /*tp_weaklistoffset*/
7848 0, /*tp_iter*/
7849 0, /*tp_iternext*/
7850 encoding_map_methods, /*tp_methods*/
7851 0, /*tp_members*/
7852 0, /*tp_getset*/
7853 0, /*tp_base*/
7854 0, /*tp_dict*/
7855 0, /*tp_descr_get*/
7856 0, /*tp_descr_set*/
7857 0, /*tp_dictoffset*/
7858 0, /*tp_init*/
7859 0, /*tp_alloc*/
7860 0, /*tp_new*/
7861 0, /*tp_free*/
7862 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007863};
7864
7865PyObject*
7866PyUnicode_BuildEncodingMap(PyObject* string)
7867{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007868 PyObject *result;
7869 struct encoding_map *mresult;
7870 int i;
7871 int need_dict = 0;
7872 unsigned char level1[32];
7873 unsigned char level2[512];
7874 unsigned char *mlevel1, *mlevel2, *mlevel3;
7875 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007876 int kind;
7877 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007878 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007879 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007880
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007881 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007882 PyErr_BadArgument();
7883 return NULL;
7884 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007885 kind = PyUnicode_KIND(string);
7886 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007887 length = PyUnicode_GET_LENGTH(string);
7888 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007889 memset(level1, 0xFF, sizeof level1);
7890 memset(level2, 0xFF, sizeof level2);
7891
7892 /* If there isn't a one-to-one mapping of NULL to \0,
7893 or if there are non-BMP characters, we need to use
7894 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007895 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007896 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007897 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007898 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007899 ch = PyUnicode_READ(kind, data, i);
7900 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007901 need_dict = 1;
7902 break;
7903 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007904 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007905 /* unmapped character */
7906 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007907 l1 = ch >> 11;
7908 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007909 if (level1[l1] == 0xFF)
7910 level1[l1] = count2++;
7911 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007912 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007913 }
7914
7915 if (count2 >= 0xFF || count3 >= 0xFF)
7916 need_dict = 1;
7917
7918 if (need_dict) {
7919 PyObject *result = PyDict_New();
7920 PyObject *key, *value;
7921 if (!result)
7922 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007923 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007924 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007925 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007926 if (!key || !value)
7927 goto failed1;
7928 if (PyDict_SetItem(result, key, value) == -1)
7929 goto failed1;
7930 Py_DECREF(key);
7931 Py_DECREF(value);
7932 }
7933 return result;
7934 failed1:
7935 Py_XDECREF(key);
7936 Py_XDECREF(value);
7937 Py_DECREF(result);
7938 return NULL;
7939 }
7940
7941 /* Create a three-level trie */
7942 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7943 16*count2 + 128*count3 - 1);
7944 if (!result)
7945 return PyErr_NoMemory();
7946 PyObject_Init(result, &EncodingMapType);
7947 mresult = (struct encoding_map*)result;
7948 mresult->count2 = count2;
7949 mresult->count3 = count3;
7950 mlevel1 = mresult->level1;
7951 mlevel2 = mresult->level23;
7952 mlevel3 = mresult->level23 + 16*count2;
7953 memcpy(mlevel1, level1, 32);
7954 memset(mlevel2, 0xFF, 16*count2);
7955 memset(mlevel3, 0, 128*count3);
7956 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007957 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007958 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007959 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7960 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007961 /* unmapped character */
7962 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007963 o1 = ch>>11;
7964 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007965 i2 = 16*mlevel1[o1] + o2;
7966 if (mlevel2[i2] == 0xFF)
7967 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007968 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007969 i3 = 128*mlevel2[i2] + o3;
7970 mlevel3[i3] = i;
7971 }
7972 return result;
7973}
7974
7975static int
Victor Stinner22168992011-11-20 17:09:18 +01007976encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007977{
7978 struct encoding_map *map = (struct encoding_map*)mapping;
7979 int l1 = c>>11;
7980 int l2 = (c>>7) & 0xF;
7981 int l3 = c & 0x7F;
7982 int i;
7983
Victor Stinner22168992011-11-20 17:09:18 +01007984 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007985 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007986 if (c == 0)
7987 return 0;
7988 /* level 1*/
7989 i = map->level1[l1];
7990 if (i == 0xFF) {
7991 return -1;
7992 }
7993 /* level 2*/
7994 i = map->level23[16*i+l2];
7995 if (i == 0xFF) {
7996 return -1;
7997 }
7998 /* level 3 */
7999 i = map->level23[16*map->count2 + 128*i + l3];
8000 if (i == 0) {
8001 return -1;
8002 }
8003 return i;
8004}
8005
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008006/* Lookup the character ch in the mapping. If the character
8007 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008008 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008009static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008010charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011{
Christian Heimes217cfd12007-12-02 14:31:20 +00008012 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008013 PyObject *x;
8014
8015 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008016 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008017 x = PyObject_GetItem(mapping, w);
8018 Py_DECREF(w);
8019 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008020 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8021 /* No mapping found means: mapping is undefined. */
8022 PyErr_Clear();
8023 x = Py_None;
8024 Py_INCREF(x);
8025 return x;
8026 } else
8027 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008029 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008030 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008031 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 long value = PyLong_AS_LONG(x);
8033 if (value < 0 || value > 255) {
8034 PyErr_SetString(PyExc_TypeError,
8035 "character mapping must be in range(256)");
8036 Py_DECREF(x);
8037 return NULL;
8038 }
8039 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008041 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008042 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 /* wrong return value */
8045 PyErr_Format(PyExc_TypeError,
8046 "character mapping must return integer, bytes or None, not %.400s",
8047 x->ob_type->tp_name);
8048 Py_DECREF(x);
8049 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050 }
8051}
8052
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008053static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008054charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008055{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008056 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8057 /* exponentially overallocate to minimize reallocations */
8058 if (requiredsize < 2*outsize)
8059 requiredsize = 2*outsize;
8060 if (_PyBytes_Resize(outobj, requiredsize))
8061 return -1;
8062 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008063}
8064
/* Outcome of encoding a single character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a lookup or resize error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008068/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008069 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008070 space is available. Return a new reference to the object that
8071 was put in the output buffer, or Py_None, if the mapping was undefined
8072 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008073 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008074static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008075charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008076 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008077{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008078 PyObject *rep;
8079 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008080 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008081
Christian Heimes90aa7642007-12-19 02:45:37 +00008082 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008083 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008084 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008085 if (res == -1)
8086 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008087 if (outsize<requiredsize)
8088 if (charmapencode_resize(outobj, outpos, requiredsize))
8089 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008090 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008091 outstart[(*outpos)++] = (char)res;
8092 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008093 }
8094
8095 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008096 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008097 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008098 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 Py_DECREF(rep);
8100 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008101 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 if (PyLong_Check(rep)) {
8103 Py_ssize_t requiredsize = *outpos+1;
8104 if (outsize<requiredsize)
8105 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8106 Py_DECREF(rep);
8107 return enc_EXCEPTION;
8108 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008109 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008110 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008111 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 else {
8113 const char *repchars = PyBytes_AS_STRING(rep);
8114 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8115 Py_ssize_t requiredsize = *outpos+repsize;
8116 if (outsize<requiredsize)
8117 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8118 Py_DECREF(rep);
8119 return enc_EXCEPTION;
8120 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008121 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008122 memcpy(outstart + *outpos, repchars, repsize);
8123 *outpos += repsize;
8124 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008125 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008126 Py_DECREF(rep);
8127 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008128}
8129
8130/* handle an error in PyUnicode_EncodeCharmap
8131 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008132static int
8133charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008134 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008135 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008136 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008137 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008138{
8139 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008140 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008141 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008142 enum PyUnicode_Kind kind;
8143 void *data;
8144 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008145 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008146 Py_ssize_t collstartpos = *inpos;
8147 Py_ssize_t collendpos = *inpos+1;
8148 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008149 char *encoding = "charmap";
8150 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008151 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008152 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008153 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008154
Benjamin Petersonbac79492012-01-14 13:34:47 -05008155 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008156 return -1;
8157 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008158 /* find all unencodable characters */
8159 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008160 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008161 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008162 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008163 val = encoding_map_lookup(ch, mapping);
8164 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 break;
8166 ++collendpos;
8167 continue;
8168 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008169
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008170 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8171 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008172 if (rep==NULL)
8173 return -1;
8174 else if (rep!=Py_None) {
8175 Py_DECREF(rep);
8176 break;
8177 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008178 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008179 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008180 }
8181 /* cache callback name lookup
8182 * (if not done yet, i.e. it's the first error) */
8183 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 if ((errors==NULL) || (!strcmp(errors, "strict")))
8185 *known_errorHandler = 1;
8186 else if (!strcmp(errors, "replace"))
8187 *known_errorHandler = 2;
8188 else if (!strcmp(errors, "ignore"))
8189 *known_errorHandler = 3;
8190 else if (!strcmp(errors, "xmlcharrefreplace"))
8191 *known_errorHandler = 4;
8192 else
8193 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008194 }
8195 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008196 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008197 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008198 return -1;
8199 case 2: /* replace */
8200 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008201 x = charmapencode_output('?', mapping, res, respos);
8202 if (x==enc_EXCEPTION) {
8203 return -1;
8204 }
8205 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008206 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008207 return -1;
8208 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008209 }
8210 /* fall through */
8211 case 3: /* ignore */
8212 *inpos = collendpos;
8213 break;
8214 case 4: /* xmlcharrefreplace */
8215 /* generate replacement (temporarily (mis)uses p) */
8216 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008217 char buffer[2+29+1+1];
8218 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008219 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008220 for (cp = buffer; *cp; ++cp) {
8221 x = charmapencode_output(*cp, mapping, res, respos);
8222 if (x==enc_EXCEPTION)
8223 return -1;
8224 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008225 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008226 return -1;
8227 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008228 }
8229 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008230 *inpos = collendpos;
8231 break;
8232 default:
8233 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008234 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008236 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008238 if (PyBytes_Check(repunicode)) {
8239 /* Directly copy bytes result to output. */
8240 Py_ssize_t outsize = PyBytes_Size(*res);
8241 Py_ssize_t requiredsize;
8242 repsize = PyBytes_Size(repunicode);
8243 requiredsize = *respos + repsize;
8244 if (requiredsize > outsize)
8245 /* Make room for all additional bytes. */
8246 if (charmapencode_resize(res, respos, requiredsize)) {
8247 Py_DECREF(repunicode);
8248 return -1;
8249 }
8250 memcpy(PyBytes_AsString(*res) + *respos,
8251 PyBytes_AsString(repunicode), repsize);
8252 *respos += repsize;
8253 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008254 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008255 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008256 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008257 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008258 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008259 Py_DECREF(repunicode);
8260 return -1;
8261 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008262 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008263 data = PyUnicode_DATA(repunicode);
8264 kind = PyUnicode_KIND(repunicode);
8265 for (index = 0; index < repsize; index++) {
8266 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8267 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008269 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 return -1;
8271 }
8272 else if (x==enc_FAILED) {
8273 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008274 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 return -1;
8276 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008277 }
8278 *inpos = newpos;
8279 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008280 }
8281 return 0;
8282}
8283
Alexander Belopolsky40018472011-02-26 01:02:56 +00008284PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008285_PyUnicode_EncodeCharmap(PyObject *unicode,
8286 PyObject *mapping,
8287 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008289 /* output object */
8290 PyObject *res = NULL;
8291 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008292 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008293 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008294 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008295 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008296 PyObject *errorHandler = NULL;
8297 PyObject *exc = NULL;
8298 /* the following variable is used for caching string comparisons
8299 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8300 * 3=ignore, 4=xmlcharrefreplace */
8301 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008302 void *data;
8303 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304
Benjamin Petersonbac79492012-01-14 13:34:47 -05008305 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008306 return NULL;
8307 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008308 data = PyUnicode_DATA(unicode);
8309 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008310
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311 /* Default to Latin-1 */
8312 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008313 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008315 /* allocate enough for a simple encoding without
8316 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008317 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008318 if (res == NULL)
8319 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008320 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008321 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008323 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008324 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008325 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008326 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008327 if (x==enc_EXCEPTION) /* error */
8328 goto onError;
8329 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008330 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008331 &exc,
8332 &known_errorHandler, &errorHandler, errors,
8333 &res, &respos)) {
8334 goto onError;
8335 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008336 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 else
8338 /* done with this character => adjust input position */
8339 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008342 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008343 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008344 if (_PyBytes_Resize(&res, respos) < 0)
8345 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008346
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008347 Py_XDECREF(exc);
8348 Py_XDECREF(errorHandler);
8349 return res;
8350
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008352 Py_XDECREF(res);
8353 Py_XDECREF(exc);
8354 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008355 return NULL;
8356}
8357
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008358/* Deprecated */
8359PyObject *
8360PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8361 Py_ssize_t size,
8362 PyObject *mapping,
8363 const char *errors)
8364{
8365 PyObject *result;
8366 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8367 if (unicode == NULL)
8368 return NULL;
8369 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8370 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008371 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008372}
8373
Alexander Belopolsky40018472011-02-26 01:02:56 +00008374PyObject *
8375PyUnicode_AsCharmapString(PyObject *unicode,
8376 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008377{
8378 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 PyErr_BadArgument();
8380 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008382 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383}
8384
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008386static void
8387make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008388 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008389 Py_ssize_t startpos, Py_ssize_t endpos,
8390 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008391{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008393 *exceptionObject = _PyUnicodeTranslateError_Create(
8394 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395 }
8396 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8398 goto onError;
8399 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8400 goto onError;
8401 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8402 goto onError;
8403 return;
8404 onError:
8405 Py_DECREF(*exceptionObject);
8406 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008407 }
8408}
8409
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008410/* error handling callback helper:
8411 build arguments, call the callback and check the arguments,
8412 put the result into newpos and return the replacement string, which
8413 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008414static PyObject *
8415unicode_translate_call_errorhandler(const char *errors,
8416 PyObject **errorHandler,
8417 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008418 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008419 Py_ssize_t startpos, Py_ssize_t endpos,
8420 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008422 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008423
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008424 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008425 PyObject *restuple;
8426 PyObject *resunicode;
8427
8428 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008430 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008432 }
8433
8434 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008435 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008436 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008438
8439 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008441 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008443 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008444 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 Py_DECREF(restuple);
8446 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008447 }
8448 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 &resunicode, &i_newpos)) {
8450 Py_DECREF(restuple);
8451 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008452 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008453 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008455 else
8456 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008457 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8459 Py_DECREF(restuple);
8460 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008461 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008462 Py_INCREF(resunicode);
8463 Py_DECREF(restuple);
8464 return resunicode;
8465}
8466
8467/* Lookup the character ch in the mapping and put the result in result,
8468 which must be decrefed by the caller.
8469 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008470static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008471charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008472{
Christian Heimes217cfd12007-12-02 14:31:20 +00008473 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008474 PyObject *x;
8475
8476 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008478 x = PyObject_GetItem(mapping, w);
8479 Py_DECREF(w);
8480 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008481 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8482 /* No mapping found means: use 1:1 mapping. */
8483 PyErr_Clear();
8484 *result = NULL;
8485 return 0;
8486 } else
8487 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008488 }
8489 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008490 *result = x;
8491 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008492 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008493 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 long value = PyLong_AS_LONG(x);
8495 long max = PyUnicode_GetMax();
8496 if (value < 0 || value > max) {
8497 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008498 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 Py_DECREF(x);
8500 return -1;
8501 }
8502 *result = x;
8503 return 0;
8504 }
8505 else if (PyUnicode_Check(x)) {
8506 *result = x;
8507 return 0;
8508 }
8509 else {
8510 /* wrong return value */
8511 PyErr_SetString(PyExc_TypeError,
8512 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008513 Py_DECREF(x);
8514 return -1;
8515 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008516}
8517/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 if not reallocate and adjust various state variables.
8519 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008520static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008523{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008524 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008525 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008526 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 /* exponentially overallocate to minimize reallocations */
8528 if (requiredsize < 2 * oldsize)
8529 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008530 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8531 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008533 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008534 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008535 }
8536 return 0;
8537}
8538/* lookup the character, put the result in the output string and adjust
8539 various state variables. Return a new reference to the object that
8540 was put in the output buffer in *result, or Py_None, if the mapping was
8541 undefined (in which case no character was written).
8542 The called must decref result.
8543 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008544static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008545charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8546 PyObject *mapping, Py_UCS4 **output,
8547 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008548 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008549{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008550 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8551 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008553 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008554 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008555 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008556 }
8557 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008558 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008559 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008561 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008562 }
8563 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008564 Py_ssize_t repsize;
8565 if (PyUnicode_READY(*res) == -1)
8566 return -1;
8567 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 if (repsize==1) {
8569 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 }
8572 else if (repsize!=0) {
8573 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008574 Py_ssize_t requiredsize = *opos +
8575 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008577 Py_ssize_t i;
8578 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008580 for(i = 0; i < repsize; i++)
8581 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008583 }
8584 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008585 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008586 return 0;
8587}
8588
Alexander Belopolsky40018472011-02-26 01:02:56 +00008589PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008590_PyUnicode_TranslateCharmap(PyObject *input,
8591 PyObject *mapping,
8592 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008594 /* input object */
8595 char *idata;
8596 Py_ssize_t size, i;
8597 int kind;
8598 /* output buffer */
8599 Py_UCS4 *output = NULL;
8600 Py_ssize_t osize;
8601 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008602 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008604 char *reason = "character maps to <undefined>";
8605 PyObject *errorHandler = NULL;
8606 PyObject *exc = NULL;
8607 /* the following variable is used for caching string comparisons
8608 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8609 * 3=ignore, 4=xmlcharrefreplace */
8610 int known_errorHandler = -1;
8611
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 PyErr_BadArgument();
8614 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008617 if (PyUnicode_READY(input) == -1)
8618 return NULL;
8619 idata = (char*)PyUnicode_DATA(input);
8620 kind = PyUnicode_KIND(input);
8621 size = PyUnicode_GET_LENGTH(input);
8622 i = 0;
8623
8624 if (size == 0) {
8625 Py_INCREF(input);
8626 return input;
8627 }
8628
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008629 /* allocate enough for a simple 1:1 translation without
8630 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008631 osize = size;
8632 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8633 opos = 0;
8634 if (output == NULL) {
8635 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008636 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008639 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008640 /* try to encode it */
8641 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008642 if (charmaptranslate_output(input, i, mapping,
8643 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008644 Py_XDECREF(x);
8645 goto onError;
8646 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008647 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 else { /* untranslatable character */
8651 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8652 Py_ssize_t repsize;
8653 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008654 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008656 Py_ssize_t collstart = i;
8657 Py_ssize_t collend = i+1;
8658 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659
Benjamin Peterson29060642009-01-31 22:14:21 +00008660 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008661 while (collend < size) {
8662 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 goto onError;
8664 Py_XDECREF(x);
8665 if (x!=Py_None)
8666 break;
8667 ++collend;
8668 }
8669 /* cache callback name lookup
8670 * (if not done yet, i.e. it's the first error) */
8671 if (known_errorHandler==-1) {
8672 if ((errors==NULL) || (!strcmp(errors, "strict")))
8673 known_errorHandler = 1;
8674 else if (!strcmp(errors, "replace"))
8675 known_errorHandler = 2;
8676 else if (!strcmp(errors, "ignore"))
8677 known_errorHandler = 3;
8678 else if (!strcmp(errors, "xmlcharrefreplace"))
8679 known_errorHandler = 4;
8680 else
8681 known_errorHandler = 0;
8682 }
8683 switch (known_errorHandler) {
8684 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008685 make_translate_exception(&exc,
8686 input, collstart, collend, reason);
8687 if (exc != NULL)
8688 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008689 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 case 2: /* replace */
8691 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008692 for (coll = collstart; coll<collend; coll++)
8693 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 /* fall through */
8695 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008696 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008697 break;
8698 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008699 /* generate replacement (temporarily (mis)uses i) */
8700 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008701 char buffer[2+29+1+1];
8702 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008703 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8704 if (charmaptranslate_makespace(&output, &osize,
8705 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 goto onError;
8707 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008708 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008709 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008710 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008711 break;
8712 default:
8713 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714 reason, input, &exc,
8715 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008716 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008718 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008719 Py_DECREF(repunicode);
8720 goto onError;
8721 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008723 repsize = PyUnicode_GET_LENGTH(repunicode);
8724 if (charmaptranslate_makespace(&output, &osize,
8725 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 Py_DECREF(repunicode);
8727 goto onError;
8728 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008729 for (uni2 = 0; repsize-->0; ++uni2)
8730 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8731 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008732 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008733 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008734 }
8735 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008736 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8737 if (!res)
8738 goto onError;
8739 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008740 Py_XDECREF(exc);
8741 Py_XDECREF(errorHandler);
8742 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008743
Benjamin Peterson29060642009-01-31 22:14:21 +00008744 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008745 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008746 Py_XDECREF(exc);
8747 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748 return NULL;
8749}
8750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008751/* Deprecated. Use PyUnicode_Translate instead. */
8752PyObject *
8753PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8754 Py_ssize_t size,
8755 PyObject *mapping,
8756 const char *errors)
8757{
Christian Heimes5f520f42012-09-11 14:03:25 +02008758 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008759 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8760 if (!unicode)
8761 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008762 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8763 Py_DECREF(unicode);
8764 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008765}
8766
Alexander Belopolsky40018472011-02-26 01:02:56 +00008767PyObject *
8768PyUnicode_Translate(PyObject *str,
8769 PyObject *mapping,
8770 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008771{
8772 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008773
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774 str = PyUnicode_FromObject(str);
8775 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008776 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008777 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778 Py_DECREF(str);
8779 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008780}
Tim Petersced69f82003-09-16 20:30:58 +00008781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008783fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008784{
8785 /* No need to call PyUnicode_READY(self) because this function is only
8786 called as a callback from fixup() which does it already. */
8787 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8788 const int kind = PyUnicode_KIND(self);
8789 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008790 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008791 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008792 Py_ssize_t i;
8793
8794 for (i = 0; i < len; ++i) {
8795 ch = PyUnicode_READ(kind, data, i);
8796 fixed = 0;
8797 if (ch > 127) {
8798 if (Py_UNICODE_ISSPACE(ch))
8799 fixed = ' ';
8800 else {
8801 const int decimal = Py_UNICODE_TODECIMAL(ch);
8802 if (decimal >= 0)
8803 fixed = '0' + decimal;
8804 }
8805 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008806 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008807 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008808 PyUnicode_WRITE(kind, data, i, fixed);
8809 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008810 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008811 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008812 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813 }
8814
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008815 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008816}
8817
8818PyObject *
8819_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8820{
8821 if (!PyUnicode_Check(unicode)) {
8822 PyErr_BadInternalCall();
8823 return NULL;
8824 }
8825 if (PyUnicode_READY(unicode) == -1)
8826 return NULL;
8827 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8828 /* If the string is already ASCII, just return the same string */
8829 Py_INCREF(unicode);
8830 return unicode;
8831 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008832 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008833}
8834
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008835PyObject *
8836PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8837 Py_ssize_t length)
8838{
Victor Stinnerf0124502011-11-21 23:12:56 +01008839 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008840 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008841 Py_UCS4 maxchar;
8842 enum PyUnicode_Kind kind;
8843 void *data;
8844
Victor Stinner99d7ad02012-02-22 13:37:39 +01008845 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008846 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008847 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008848 if (ch > 127) {
8849 int decimal = Py_UNICODE_TODECIMAL(ch);
8850 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008851 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008852 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008853 }
8854 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008855
8856 /* Copy to a new string */
8857 decimal = PyUnicode_New(length, maxchar);
8858 if (decimal == NULL)
8859 return decimal;
8860 kind = PyUnicode_KIND(decimal);
8861 data = PyUnicode_DATA(decimal);
8862 /* Iterate over code points */
8863 for (i = 0; i < length; i++) {
8864 Py_UNICODE ch = s[i];
8865 if (ch > 127) {
8866 int decimal = Py_UNICODE_TODECIMAL(ch);
8867 if (decimal >= 0)
8868 ch = '0' + decimal;
8869 }
8870 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008871 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008872 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008873}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008874/* --- Decimal Encoder ---------------------------------------------------- */
8875
Alexander Belopolsky40018472011-02-26 01:02:56 +00008876int
8877PyUnicode_EncodeDecimal(Py_UNICODE *s,
8878 Py_ssize_t length,
8879 char *output,
8880 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008881{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008882 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008883 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008884 enum PyUnicode_Kind kind;
8885 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008886
8887 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008888 PyErr_BadArgument();
8889 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008890 }
8891
Victor Stinner42bf7752011-11-21 22:52:58 +01008892 unicode = PyUnicode_FromUnicode(s, length);
8893 if (unicode == NULL)
8894 return -1;
8895
Benjamin Petersonbac79492012-01-14 13:34:47 -05008896 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008897 Py_DECREF(unicode);
8898 return -1;
8899 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008900 kind = PyUnicode_KIND(unicode);
8901 data = PyUnicode_DATA(unicode);
8902
Victor Stinnerb84d7232011-11-22 01:50:07 +01008903 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008904 PyObject *exc;
8905 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008906 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008907 Py_ssize_t startpos;
8908
8909 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008910
Benjamin Peterson29060642009-01-31 22:14:21 +00008911 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008912 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008913 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008914 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008915 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008916 decimal = Py_UNICODE_TODECIMAL(ch);
8917 if (decimal >= 0) {
8918 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008919 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008920 continue;
8921 }
8922 if (0 < ch && ch < 256) {
8923 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008924 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008925 continue;
8926 }
Victor Stinner6345be92011-11-25 20:09:01 +01008927
Victor Stinner42bf7752011-11-21 22:52:58 +01008928 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008929 exc = NULL;
8930 raise_encode_exception(&exc, "decimal", unicode,
8931 startpos, startpos+1,
8932 "invalid decimal Unicode string");
8933 Py_XDECREF(exc);
8934 Py_DECREF(unicode);
8935 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008936 }
8937 /* 0-terminate the output string */
8938 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008939 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008940 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008941}
8942
Guido van Rossumd57fd912000-03-10 22:53:23 +00008943/* --- Helpers ------------------------------------------------------------ */
8944
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008945static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008946any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008947 Py_ssize_t start,
8948 Py_ssize_t end)
8949{
8950 int kind1, kind2, kind;
8951 void *buf1, *buf2;
8952 Py_ssize_t len1, len2, result;
8953
8954 kind1 = PyUnicode_KIND(s1);
8955 kind2 = PyUnicode_KIND(s2);
8956 kind = kind1 > kind2 ? kind1 : kind2;
8957 buf1 = PyUnicode_DATA(s1);
8958 buf2 = PyUnicode_DATA(s2);
8959 if (kind1 != kind)
8960 buf1 = _PyUnicode_AsKind(s1, kind);
8961 if (!buf1)
8962 return -2;
8963 if (kind2 != kind)
8964 buf2 = _PyUnicode_AsKind(s2, kind);
8965 if (!buf2) {
8966 if (kind1 != kind) PyMem_Free(buf1);
8967 return -2;
8968 }
8969 len1 = PyUnicode_GET_LENGTH(s1);
8970 len2 = PyUnicode_GET_LENGTH(s2);
8971
Victor Stinner794d5672011-10-10 03:21:36 +02008972 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008973 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008974 case PyUnicode_1BYTE_KIND:
8975 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8976 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8977 else
8978 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8979 break;
8980 case PyUnicode_2BYTE_KIND:
8981 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8982 break;
8983 case PyUnicode_4BYTE_KIND:
8984 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8985 break;
8986 default:
8987 assert(0); result = -2;
8988 }
8989 }
8990 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008991 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008992 case PyUnicode_1BYTE_KIND:
8993 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8994 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8995 else
8996 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8997 break;
8998 case PyUnicode_2BYTE_KIND:
8999 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9000 break;
9001 case PyUnicode_4BYTE_KIND:
9002 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9003 break;
9004 default:
9005 assert(0); result = -2;
9006 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009007 }
9008
9009 if (kind1 != kind)
9010 PyMem_Free(buf1);
9011 if (kind2 != kind)
9012 PyMem_Free(buf2);
9013
9014 return result;
9015}
9016
9017Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009018_PyUnicode_InsertThousandsGrouping(
9019 PyObject *unicode, Py_ssize_t index,
9020 Py_ssize_t n_buffer,
9021 void *digits, Py_ssize_t n_digits,
9022 Py_ssize_t min_width,
9023 const char *grouping, PyObject *thousands_sep,
9024 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009025{
Victor Stinner41a863c2012-02-24 00:37:51 +01009026 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009027 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009028 Py_ssize_t thousands_sep_len;
9029 Py_ssize_t len;
9030
9031 if (unicode != NULL) {
9032 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009033 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009034 }
9035 else {
9036 kind = PyUnicode_1BYTE_KIND;
9037 data = NULL;
9038 }
9039 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9040 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9041 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9042 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009043 if (thousands_sep_kind < kind) {
9044 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9045 if (!thousands_sep_data)
9046 return -1;
9047 }
9048 else {
9049 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9050 if (!data)
9051 return -1;
9052 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009053 }
9054
Benjamin Petersonead6b532011-12-20 17:23:42 -06009055 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009056 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009057 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009058 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009059 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009060 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009061 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009062 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009063 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009064 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009065 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009066 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009067 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009069 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009070 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009071 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009072 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009073 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009074 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009075 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009076 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009077 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009078 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009079 break;
9080 default:
9081 assert(0);
9082 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009083 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009084 if (unicode != NULL && thousands_sep_kind != kind) {
9085 if (thousands_sep_kind < kind)
9086 PyMem_Free(thousands_sep_data);
9087 else
9088 PyMem_Free(data);
9089 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009090 if (unicode == NULL) {
9091 *maxchar = 127;
9092 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009093 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009094 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009095 }
9096 }
9097 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098}
9099
9100
/* Helper macro to fix up start/end slice values in place: end is clipped
   to len, negative start/end are taken relative to len, and anything
   still negative becomes 0. start and end must be modifiable lvalues;
   len is evaluated more than once.
   Every argument is parenthesized so that expressions such as
   "cond ? a : b" can be passed safely. The expansion is deliberately kept
   as plain statements (not do { } while (0)) so that existing call sites
   keep compiling unchanged. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009115
Alexander Belopolsky40018472011-02-26 01:02:56 +00009116Py_ssize_t
9117PyUnicode_Count(PyObject *str,
9118 PyObject *substr,
9119 Py_ssize_t start,
9120 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009122 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009123 PyObject* str_obj;
9124 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009125 int kind1, kind2, kind;
9126 void *buf1 = NULL, *buf2 = NULL;
9127 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009128
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009129 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009130 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009131 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009132 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009133 if (!sub_obj) {
9134 Py_DECREF(str_obj);
9135 return -1;
9136 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009137 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009138 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009139 Py_DECREF(str_obj);
9140 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009141 }
Tim Petersced69f82003-09-16 20:30:58 +00009142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009143 kind1 = PyUnicode_KIND(str_obj);
9144 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009145 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009146 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009147 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009148 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02009149 if (kind2 > kind) {
9150 Py_DECREF(sub_obj);
9151 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009152 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02009153 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01009154 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009155 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009156 if (!buf2)
9157 goto onError;
9158 len1 = PyUnicode_GET_LENGTH(str_obj);
9159 len2 = PyUnicode_GET_LENGTH(sub_obj);
9160
9161 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009162 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009163 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009164 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9165 result = asciilib_count(
9166 ((Py_UCS1*)buf1) + start, end - start,
9167 buf2, len2, PY_SSIZE_T_MAX
9168 );
9169 else
9170 result = ucs1lib_count(
9171 ((Py_UCS1*)buf1) + start, end - start,
9172 buf2, len2, PY_SSIZE_T_MAX
9173 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009174 break;
9175 case PyUnicode_2BYTE_KIND:
9176 result = ucs2lib_count(
9177 ((Py_UCS2*)buf1) + start, end - start,
9178 buf2, len2, PY_SSIZE_T_MAX
9179 );
9180 break;
9181 case PyUnicode_4BYTE_KIND:
9182 result = ucs4lib_count(
9183 ((Py_UCS4*)buf1) + start, end - start,
9184 buf2, len2, PY_SSIZE_T_MAX
9185 );
9186 break;
9187 default:
9188 assert(0); result = 0;
9189 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009190
9191 Py_DECREF(sub_obj);
9192 Py_DECREF(str_obj);
9193
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009194 if (kind2 != kind)
9195 PyMem_Free(buf2);
9196
Guido van Rossumd57fd912000-03-10 22:53:23 +00009197 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009198 onError:
9199 Py_DECREF(sub_obj);
9200 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009201 if (kind2 != kind && buf2)
9202 PyMem_Free(buf2);
9203 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009204}
9205
Alexander Belopolsky40018472011-02-26 01:02:56 +00009206Py_ssize_t
9207PyUnicode_Find(PyObject *str,
9208 PyObject *sub,
9209 Py_ssize_t start,
9210 Py_ssize_t end,
9211 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009212{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009213 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009214
Guido van Rossumd57fd912000-03-10 22:53:23 +00009215 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009216 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009217 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009218 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009219 if (!sub) {
9220 Py_DECREF(str);
9221 return -2;
9222 }
9223 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9224 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009225 Py_DECREF(str);
9226 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009227 }
Tim Petersced69f82003-09-16 20:30:58 +00009228
Victor Stinner794d5672011-10-10 03:21:36 +02009229 result = any_find_slice(direction,
9230 str, sub, start, end
9231 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009232
Guido van Rossumd57fd912000-03-10 22:53:23 +00009233 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009234 Py_DECREF(sub);
9235
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236 return result;
9237}
9238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239Py_ssize_t
9240PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9241 Py_ssize_t start, Py_ssize_t end,
9242 int direction)
9243{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009245 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 if (PyUnicode_READY(str) == -1)
9247 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009248 if (start < 0 || end < 0) {
9249 PyErr_SetString(PyExc_IndexError, "string index out of range");
9250 return -2;
9251 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009252 if (end > PyUnicode_GET_LENGTH(str))
9253 end = PyUnicode_GET_LENGTH(str);
9254 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009255 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9256 kind, end-start, ch, direction);
9257 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009258 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009259 else
9260 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261}
9262
Alexander Belopolsky40018472011-02-26 01:02:56 +00009263static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009264tailmatch(PyObject *self,
9265 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009266 Py_ssize_t start,
9267 Py_ssize_t end,
9268 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009269{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009270 int kind_self;
9271 int kind_sub;
9272 void *data_self;
9273 void *data_sub;
9274 Py_ssize_t offset;
9275 Py_ssize_t i;
9276 Py_ssize_t end_sub;
9277
9278 if (PyUnicode_READY(self) == -1 ||
9279 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009280 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281
9282 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009283 return 1;
9284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9286 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009287 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009288 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009290 kind_self = PyUnicode_KIND(self);
9291 data_self = PyUnicode_DATA(self);
9292 kind_sub = PyUnicode_KIND(substring);
9293 data_sub = PyUnicode_DATA(substring);
9294 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9295
9296 if (direction > 0)
9297 offset = end;
9298 else
9299 offset = start;
9300
9301 if (PyUnicode_READ(kind_self, data_self, offset) ==
9302 PyUnicode_READ(kind_sub, data_sub, 0) &&
9303 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9304 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9305 /* If both are of the same kind, memcmp is sufficient */
9306 if (kind_self == kind_sub) {
9307 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009308 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009309 data_sub,
9310 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009311 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312 }
9313 /* otherwise we have to compare each character by first accesing it */
9314 else {
9315 /* We do not need to compare 0 and len(substring)-1 because
9316 the if statement above ensured already that they are equal
9317 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318 for (i = 1; i < end_sub; ++i) {
9319 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9320 PyUnicode_READ(kind_sub, data_sub, i))
9321 return 0;
9322 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009323 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009324 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009325 }
9326
9327 return 0;
9328}
9329
Alexander Belopolsky40018472011-02-26 01:02:56 +00009330Py_ssize_t
9331PyUnicode_Tailmatch(PyObject *str,
9332 PyObject *substr,
9333 Py_ssize_t start,
9334 Py_ssize_t end,
9335 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009336{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009337 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009338
Guido van Rossumd57fd912000-03-10 22:53:23 +00009339 str = PyUnicode_FromObject(str);
9340 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009341 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009342 substr = PyUnicode_FromObject(substr);
9343 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009344 Py_DECREF(str);
9345 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009346 }
Tim Petersced69f82003-09-16 20:30:58 +00009347
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009348 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009349 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009350 Py_DECREF(str);
9351 Py_DECREF(substr);
9352 return result;
9353}
9354
Guido van Rossumd57fd912000-03-10 22:53:23 +00009355/* Apply fixfct filter to the Unicode object self and return a
9356 reference to the modified object */
9357
Alexander Belopolsky40018472011-02-26 01:02:56 +00009358static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009359fixup(PyObject *self,
9360 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009362 PyObject *u;
9363 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009364 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009366 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009367 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009368 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009369 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009371 /* fix functions return the new maximum character in a string,
9372 if the kind of the resulting unicode object does not change,
9373 everything is fine. Otherwise we need to change the string kind
9374 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009375 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009376
9377 if (maxchar_new == 0) {
9378 /* no changes */;
9379 if (PyUnicode_CheckExact(self)) {
9380 Py_DECREF(u);
9381 Py_INCREF(self);
9382 return self;
9383 }
9384 else
9385 return u;
9386 }
9387
Victor Stinnere6abb482012-05-02 01:15:40 +02009388 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389
Victor Stinnereaab6042011-12-11 22:22:39 +01009390 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009391 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009392
9393 /* In case the maximum character changed, we need to
9394 convert the string to the new category. */
9395 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9396 if (v == NULL) {
9397 Py_DECREF(u);
9398 return NULL;
9399 }
9400 if (maxchar_new > maxchar_old) {
9401 /* If the maxchar increased so that the kind changed, not all
9402 characters are representable anymore and we need to fix the
9403 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009404 _PyUnicode_FastCopyCharacters(v, 0,
9405 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009406 maxchar_old = fixfct(v);
9407 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009408 }
9409 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009410 _PyUnicode_FastCopyCharacters(v, 0,
9411 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009412 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009413 Py_DECREF(u);
9414 assert(_PyUnicode_CheckConsistency(v, 1));
9415 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009416}
9417
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009418static PyObject *
9419ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009420{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009421 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9422 char *resdata, *data = PyUnicode_DATA(self);
9423 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009424
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009425 res = PyUnicode_New(len, 127);
9426 if (res == NULL)
9427 return NULL;
9428 resdata = PyUnicode_DATA(res);
9429 if (lower)
9430 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009431 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009432 _Py_bytes_upper(resdata, data, len);
9433 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009434}
9435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009437handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009438{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009439 Py_ssize_t j;
9440 int final_sigma;
9441 Py_UCS4 c;
9442 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009443
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009444 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9445
9446 where ! is a negation and \p{xxx} is a character with property xxx.
9447 */
9448 for (j = i - 1; j >= 0; j--) {
9449 c = PyUnicode_READ(kind, data, j);
9450 if (!_PyUnicode_IsCaseIgnorable(c))
9451 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009452 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009453 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9454 if (final_sigma) {
9455 for (j = i + 1; j < length; j++) {
9456 c = PyUnicode_READ(kind, data, j);
9457 if (!_PyUnicode_IsCaseIgnorable(c))
9458 break;
9459 }
9460 final_sigma = j == length || !_PyUnicode_IsCased(c);
9461 }
9462 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009463}
9464
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009465static int
9466lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9467 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009468{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009469 /* Obscure special case. */
9470 if (c == 0x3A3) {
9471 mapped[0] = handle_capital_sigma(kind, data, length, i);
9472 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009473 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009474 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475}
9476
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009477static Py_ssize_t
9478do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009480 Py_ssize_t i, k = 0;
9481 int n_res, j;
9482 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009483
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009484 c = PyUnicode_READ(kind, data, 0);
9485 n_res = _PyUnicode_ToUpperFull(c, mapped);
9486 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009487 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009488 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009489 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009490 for (i = 1; i < length; i++) {
9491 c = PyUnicode_READ(kind, data, i);
9492 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9493 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009494 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009495 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009496 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009497 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009498 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009499}
9500
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009501static Py_ssize_t
9502do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9503 Py_ssize_t i, k = 0;
9504
9505 for (i = 0; i < length; i++) {
9506 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9507 int n_res, j;
9508 if (Py_UNICODE_ISUPPER(c)) {
9509 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9510 }
9511 else if (Py_UNICODE_ISLOWER(c)) {
9512 n_res = _PyUnicode_ToUpperFull(c, mapped);
9513 }
9514 else {
9515 n_res = 1;
9516 mapped[0] = c;
9517 }
9518 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009519 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009520 res[k++] = mapped[j];
9521 }
9522 }
9523 return k;
9524}
9525
9526static Py_ssize_t
9527do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9528 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009529{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009530 Py_ssize_t i, k = 0;
9531
9532 for (i = 0; i < length; i++) {
9533 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9534 int n_res, j;
9535 if (lower)
9536 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9537 else
9538 n_res = _PyUnicode_ToUpperFull(c, mapped);
9539 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009540 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009541 res[k++] = mapped[j];
9542 }
9543 }
9544 return k;
9545}
9546
9547static Py_ssize_t
9548do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9549{
9550 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9551}
9552
9553static Py_ssize_t
9554do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9555{
9556 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9557}
9558
Benjamin Petersone51757f2012-01-12 21:10:29 -05009559static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009560do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9561{
9562 Py_ssize_t i, k = 0;
9563
9564 for (i = 0; i < length; i++) {
9565 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9566 Py_UCS4 mapped[3];
9567 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9568 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009569 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009570 res[k++] = mapped[j];
9571 }
9572 }
9573 return k;
9574}
9575
9576static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009577do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9578{
9579 Py_ssize_t i, k = 0;
9580 int previous_is_cased;
9581
9582 previous_is_cased = 0;
9583 for (i = 0; i < length; i++) {
9584 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9585 Py_UCS4 mapped[3];
9586 int n_res, j;
9587
9588 if (previous_is_cased)
9589 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9590 else
9591 n_res = _PyUnicode_ToTitleFull(c, mapped);
9592
9593 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009594 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009595 res[k++] = mapped[j];
9596 }
9597
9598 previous_is_cased = _PyUnicode_IsCased(c);
9599 }
9600 return k;
9601}
9602
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009603static PyObject *
9604case_operation(PyObject *self,
9605 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9606{
9607 PyObject *res = NULL;
9608 Py_ssize_t length, newlength = 0;
9609 int kind, outkind;
9610 void *data, *outdata;
9611 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9612
Benjamin Petersoneea48462012-01-16 14:28:50 -05009613 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009614
9615 kind = PyUnicode_KIND(self);
9616 data = PyUnicode_DATA(self);
9617 length = PyUnicode_GET_LENGTH(self);
9618 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9619 if (tmp == NULL)
9620 return PyErr_NoMemory();
9621 newlength = perform(kind, data, length, tmp, &maxchar);
9622 res = PyUnicode_New(newlength, maxchar);
9623 if (res == NULL)
9624 goto leave;
9625 tmpend = tmp + newlength;
9626 outdata = PyUnicode_DATA(res);
9627 outkind = PyUnicode_KIND(res);
9628 switch (outkind) {
9629 case PyUnicode_1BYTE_KIND:
9630 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9631 break;
9632 case PyUnicode_2BYTE_KIND:
9633 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9634 break;
9635 case PyUnicode_4BYTE_KIND:
9636 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9637 break;
9638 default:
9639 assert(0);
9640 break;
9641 }
9642 leave:
9643 PyMem_FREE(tmp);
9644 return res;
9645}
9646
Tim Peters8ce9f162004-08-27 01:49:32 +00009647PyObject *
9648PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009649{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009651 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009653 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009654 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9655 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009656 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009657 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009658 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009659 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009660 int use_memcpy;
9661 unsigned char *res_data = NULL, *sep_data = NULL;
9662 PyObject *last_obj;
9663 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009664
Tim Peters05eba1f2004-08-27 21:32:02 +00009665 fseq = PySequence_Fast(seq, "");
9666 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009667 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009668 }
9669
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009670 /* NOTE: the following code can't call back into Python code,
9671 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009672 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009673
Tim Peters05eba1f2004-08-27 21:32:02 +00009674 seqlen = PySequence_Fast_GET_SIZE(fseq);
9675 /* If empty sequence, return u"". */
9676 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009677 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009678 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009679 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009680
Tim Peters05eba1f2004-08-27 21:32:02 +00009681 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009682 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009683 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009684 if (seqlen == 1) {
9685 if (PyUnicode_CheckExact(items[0])) {
9686 res = items[0];
9687 Py_INCREF(res);
9688 Py_DECREF(fseq);
9689 return res;
9690 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009691 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009692 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009693 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009694 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009695 /* Set up sep and seplen */
9696 if (separator == NULL) {
9697 /* fall back to a blank space separator */
9698 sep = PyUnicode_FromOrdinal(' ');
9699 if (!sep)
9700 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009701 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009702 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009703 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009704 else {
9705 if (!PyUnicode_Check(separator)) {
9706 PyErr_Format(PyExc_TypeError,
9707 "separator: expected str instance,"
9708 " %.80s found",
9709 Py_TYPE(separator)->tp_name);
9710 goto onError;
9711 }
9712 if (PyUnicode_READY(separator))
9713 goto onError;
9714 sep = separator;
9715 seplen = PyUnicode_GET_LENGTH(separator);
9716 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9717 /* inc refcount to keep this code path symmetric with the
9718 above case of a blank separator */
9719 Py_INCREF(sep);
9720 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009721 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009722 }
9723
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009724 /* There are at least two things to join, or else we have a subclass
9725 * of str in the sequence.
9726 * Do a pre-pass to figure out the total amount of space we'll
9727 * need (sz), and see whether all argument are strings.
9728 */
9729 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009730#ifdef Py_DEBUG
9731 use_memcpy = 0;
9732#else
9733 use_memcpy = 1;
9734#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009735 for (i = 0; i < seqlen; i++) {
9736 const Py_ssize_t old_sz = sz;
9737 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009738 if (!PyUnicode_Check(item)) {
9739 PyErr_Format(PyExc_TypeError,
9740 "sequence item %zd: expected str instance,"
9741 " %.80s found",
9742 i, Py_TYPE(item)->tp_name);
9743 goto onError;
9744 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009745 if (PyUnicode_READY(item) == -1)
9746 goto onError;
9747 sz += PyUnicode_GET_LENGTH(item);
9748 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009749 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009750 if (i != 0)
9751 sz += seplen;
9752 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9753 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009754 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009755 goto onError;
9756 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009757 if (use_memcpy && last_obj != NULL) {
9758 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9759 use_memcpy = 0;
9760 }
9761 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009762 }
Tim Petersced69f82003-09-16 20:30:58 +00009763
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009764 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009765 if (res == NULL)
9766 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009767
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009768 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009769#ifdef Py_DEBUG
9770 use_memcpy = 0;
9771#else
9772 if (use_memcpy) {
9773 res_data = PyUnicode_1BYTE_DATA(res);
9774 kind = PyUnicode_KIND(res);
9775 if (seplen != 0)
9776 sep_data = PyUnicode_1BYTE_DATA(sep);
9777 }
9778#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009779 if (use_memcpy) {
9780 for (i = 0; i < seqlen; ++i) {
9781 Py_ssize_t itemlen;
9782 item = items[i];
9783
9784 /* Copy item, and maybe the separator. */
9785 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009786 Py_MEMCPY(res_data,
9787 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009788 kind * seplen);
9789 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009790 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009791
9792 itemlen = PyUnicode_GET_LENGTH(item);
9793 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009794 Py_MEMCPY(res_data,
9795 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009796 kind * itemlen);
9797 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009798 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009799 }
9800 assert(res_data == PyUnicode_1BYTE_DATA(res)
9801 + kind * PyUnicode_GET_LENGTH(res));
9802 }
9803 else {
9804 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9805 Py_ssize_t itemlen;
9806 item = items[i];
9807
9808 /* Copy item, and maybe the separator. */
9809 if (i && seplen != 0) {
9810 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9811 res_offset += seplen;
9812 }
9813
9814 itemlen = PyUnicode_GET_LENGTH(item);
9815 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009816 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009817 res_offset += itemlen;
9818 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009819 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009820 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009821 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009822
Tim Peters05eba1f2004-08-27 21:32:02 +00009823 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009824 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009825 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009826 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009827
Benjamin Peterson29060642009-01-31 22:14:21 +00009828 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009829 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009831 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009832 return NULL;
9833}
9834
/* Store value into length consecutive characters of the buffer data,
   beginning at character index start; kind selects the character width.
   For the 1-byte kind, value is truncated to unsigned char.

   kind must not be PyUnicode_WCHAR_KIND.  The arguments may be evaluated
   more than once, so they must not have side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9858
Victor Stinnerd3f08822012-05-29 12:57:52 +02009859void
9860_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9861 Py_UCS4 fill_char)
9862{
9863 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9864 const void *data = PyUnicode_DATA(unicode);
9865 assert(PyUnicode_IS_READY(unicode));
9866 assert(unicode_modifiable(unicode));
9867 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9868 assert(start >= 0);
9869 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9870 FILL(kind, data, fill_char, start, length);
9871}
9872
Victor Stinner3fe55312012-01-04 00:33:50 +01009873Py_ssize_t
9874PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9875 Py_UCS4 fill_char)
9876{
9877 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009878
9879 if (!PyUnicode_Check(unicode)) {
9880 PyErr_BadInternalCall();
9881 return -1;
9882 }
9883 if (PyUnicode_READY(unicode) == -1)
9884 return -1;
9885 if (unicode_check_modifiable(unicode))
9886 return -1;
9887
Victor Stinnerd3f08822012-05-29 12:57:52 +02009888 if (start < 0) {
9889 PyErr_SetString(PyExc_IndexError, "string index out of range");
9890 return -1;
9891 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009892 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9893 PyErr_SetString(PyExc_ValueError,
9894 "fill character is bigger than "
9895 "the string maximum character");
9896 return -1;
9897 }
9898
9899 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9900 length = Py_MIN(maxlen, length);
9901 if (length <= 0)
9902 return 0;
9903
Victor Stinnerd3f08822012-05-29 12:57:52 +02009904 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009905 return length;
9906}
9907
Victor Stinner9310abb2011-10-05 00:59:23 +02009908static PyObject *
9909pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009910 Py_ssize_t left,
9911 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009912 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009913{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009914 PyObject *u;
9915 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009916 int kind;
9917 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009918
9919 if (left < 0)
9920 left = 0;
9921 if (right < 0)
9922 right = 0;
9923
Victor Stinnerc4b49542011-12-11 22:44:26 +01009924 if (left == 0 && right == 0)
9925 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009927 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9928 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009929 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9930 return NULL;
9931 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009932 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009933 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009934 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009935 if (!u)
9936 return NULL;
9937
9938 kind = PyUnicode_KIND(u);
9939 data = PyUnicode_DATA(u);
9940 if (left)
9941 FILL(kind, data, fill, 0, left);
9942 if (right)
9943 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009944 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009945 assert(_PyUnicode_CheckConsistency(u, 1));
9946 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009947}
9948
Alexander Belopolsky40018472011-02-26 01:02:56 +00009949PyObject *
9950PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009951{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009953
9954 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009955 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009956 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009957 if (PyUnicode_READY(string) == -1) {
9958 Py_DECREF(string);
9959 return NULL;
9960 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009961
Benjamin Petersonead6b532011-12-20 17:23:42 -06009962 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009963 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009964 if (PyUnicode_IS_ASCII(string))
9965 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009966 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009967 PyUnicode_GET_LENGTH(string), keepends);
9968 else
9969 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009970 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009971 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009972 break;
9973 case PyUnicode_2BYTE_KIND:
9974 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009975 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976 PyUnicode_GET_LENGTH(string), keepends);
9977 break;
9978 case PyUnicode_4BYTE_KIND:
9979 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009980 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009981 PyUnicode_GET_LENGTH(string), keepends);
9982 break;
9983 default:
9984 assert(0);
9985 list = 0;
9986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009987 Py_DECREF(string);
9988 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009989}
9990
Alexander Belopolsky40018472011-02-26 01:02:56 +00009991static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009992split(PyObject *self,
9993 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009994 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009995{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009996 int kind1, kind2, kind;
9997 void *buf1, *buf2;
9998 Py_ssize_t len1, len2;
9999 PyObject* out;
10000
Guido van Rossumd57fd912000-03-10 22:53:23 +000010001 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010002 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 if (PyUnicode_READY(self) == -1)
10005 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010007 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010008 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010010 if (PyUnicode_IS_ASCII(self))
10011 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010012 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010013 PyUnicode_GET_LENGTH(self), maxcount
10014 );
10015 else
10016 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010017 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010018 PyUnicode_GET_LENGTH(self), maxcount
10019 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020 case PyUnicode_2BYTE_KIND:
10021 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010022 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023 PyUnicode_GET_LENGTH(self), maxcount
10024 );
10025 case PyUnicode_4BYTE_KIND:
10026 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010027 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010028 PyUnicode_GET_LENGTH(self), maxcount
10029 );
10030 default:
10031 assert(0);
10032 return NULL;
10033 }
10034
10035 if (PyUnicode_READY(substring) == -1)
10036 return NULL;
10037
10038 kind1 = PyUnicode_KIND(self);
10039 kind2 = PyUnicode_KIND(substring);
10040 kind = kind1 > kind2 ? kind1 : kind2;
10041 buf1 = PyUnicode_DATA(self);
10042 buf2 = PyUnicode_DATA(substring);
10043 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010044 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 if (!buf1)
10046 return NULL;
10047 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010048 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049 if (!buf2) {
10050 if (kind1 != kind) PyMem_Free(buf1);
10051 return NULL;
10052 }
10053 len1 = PyUnicode_GET_LENGTH(self);
10054 len2 = PyUnicode_GET_LENGTH(substring);
10055
Benjamin Petersonead6b532011-12-20 17:23:42 -060010056 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010058 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10059 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010060 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010061 else
10062 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010063 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064 break;
10065 case PyUnicode_2BYTE_KIND:
10066 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010067 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010068 break;
10069 case PyUnicode_4BYTE_KIND:
10070 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010071 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 break;
10073 default:
10074 out = NULL;
10075 }
10076 if (kind1 != kind)
10077 PyMem_Free(buf1);
10078 if (kind2 != kind)
10079 PyMem_Free(buf2);
10080 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010081}
10082
Alexander Belopolsky40018472011-02-26 01:02:56 +000010083static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010084rsplit(PyObject *self,
10085 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010086 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010087{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 int kind1, kind2, kind;
10089 void *buf1, *buf2;
10090 Py_ssize_t len1, len2;
10091 PyObject* out;
10092
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010093 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010094 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 if (PyUnicode_READY(self) == -1)
10097 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010100 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010102 if (PyUnicode_IS_ASCII(self))
10103 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010104 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010105 PyUnicode_GET_LENGTH(self), maxcount
10106 );
10107 else
10108 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010109 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010110 PyUnicode_GET_LENGTH(self), maxcount
10111 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 case PyUnicode_2BYTE_KIND:
10113 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010114 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 PyUnicode_GET_LENGTH(self), maxcount
10116 );
10117 case PyUnicode_4BYTE_KIND:
10118 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010119 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 PyUnicode_GET_LENGTH(self), maxcount
10121 );
10122 default:
10123 assert(0);
10124 return NULL;
10125 }
10126
10127 if (PyUnicode_READY(substring) == -1)
10128 return NULL;
10129
10130 kind1 = PyUnicode_KIND(self);
10131 kind2 = PyUnicode_KIND(substring);
10132 kind = kind1 > kind2 ? kind1 : kind2;
10133 buf1 = PyUnicode_DATA(self);
10134 buf2 = PyUnicode_DATA(substring);
10135 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010136 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 if (!buf1)
10138 return NULL;
10139 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010140 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 if (!buf2) {
10142 if (kind1 != kind) PyMem_Free(buf1);
10143 return NULL;
10144 }
10145 len1 = PyUnicode_GET_LENGTH(self);
10146 len2 = PyUnicode_GET_LENGTH(substring);
10147
Benjamin Petersonead6b532011-12-20 17:23:42 -060010148 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010149 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010150 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10151 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010152 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010153 else
10154 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010155 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 break;
10157 case PyUnicode_2BYTE_KIND:
10158 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010159 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 break;
10161 case PyUnicode_4BYTE_KIND:
10162 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010163 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 break;
10165 default:
10166 out = NULL;
10167 }
10168 if (kind1 != kind)
10169 PyMem_Free(buf1);
10170 if (kind2 != kind)
10171 PyMem_Free(buf2);
10172 return out;
10173}
10174
10175static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010176anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10177 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010179 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010181 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10182 return asciilib_find(buf1, len1, buf2, len2, offset);
10183 else
10184 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 case PyUnicode_2BYTE_KIND:
10186 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10187 case PyUnicode_4BYTE_KIND:
10188 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10189 }
10190 assert(0);
10191 return -1;
10192}
10193
10194static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010195anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10196 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010198 switch (kind) {
10199 case PyUnicode_1BYTE_KIND:
10200 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10201 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10202 else
10203 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10204 case PyUnicode_2BYTE_KIND:
10205 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10206 case PyUnicode_4BYTE_KIND:
10207 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10208 }
10209 assert(0);
10210 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010211}
10212
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010213static void
10214replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10215 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10216{
10217 int kind = PyUnicode_KIND(u);
10218 void *data = PyUnicode_DATA(u);
10219 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10220 if (kind == PyUnicode_1BYTE_KIND) {
10221 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10222 (Py_UCS1 *)data + len,
10223 u1, u2, maxcount);
10224 }
10225 else if (kind == PyUnicode_2BYTE_KIND) {
10226 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10227 (Py_UCS2 *)data + len,
10228 u1, u2, maxcount);
10229 }
10230 else {
10231 assert(kind == PyUnicode_4BYTE_KIND);
10232 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10233 (Py_UCS4 *)data + len,
10234 u1, u2, maxcount);
10235 }
10236}
10237
Alexander Belopolsky40018472011-02-26 01:02:56 +000010238static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239replace(PyObject *self, PyObject *str1,
10240 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 PyObject *u;
10243 char *sbuf = PyUnicode_DATA(self);
10244 char *buf1 = PyUnicode_DATA(str1);
10245 char *buf2 = PyUnicode_DATA(str2);
10246 int srelease = 0, release1 = 0, release2 = 0;
10247 int skind = PyUnicode_KIND(self);
10248 int kind1 = PyUnicode_KIND(str1);
10249 int kind2 = PyUnicode_KIND(str2);
10250 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10251 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10252 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010253 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010254 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010255
10256 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010257 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010259 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010260
Victor Stinner59de0ee2011-10-07 10:01:28 +020010261 if (str1 == str2)
10262 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263
Victor Stinner49a0a212011-10-12 23:46:10 +020010264 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010265 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10266 if (maxchar < maxchar_str1)
10267 /* substring too wide to be present */
10268 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010269 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10270 /* Replacing str1 with str2 may cause a maxchar reduction in the
10271 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010272 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010273 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010276 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010278 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010280 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010281 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010282 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010283
Victor Stinner69ed0f42013-04-09 21:48:24 +020010284 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010285 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010286 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010287 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010288 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010290 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010292
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010293 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10294 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010295 }
10296 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 int rkind = skind;
10298 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010299 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010300
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 if (kind1 < rkind) {
10302 /* widen substring */
10303 buf1 = _PyUnicode_AsKind(str1, rkind);
10304 if (!buf1) goto error;
10305 release1 = 1;
10306 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010307 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010308 if (i < 0)
10309 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 if (rkind > kind2) {
10311 /* widen replacement */
10312 buf2 = _PyUnicode_AsKind(str2, rkind);
10313 if (!buf2) goto error;
10314 release2 = 1;
10315 }
10316 else if (rkind < kind2) {
10317 /* widen self and buf1 */
10318 rkind = kind2;
10319 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010320 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 sbuf = _PyUnicode_AsKind(self, rkind);
10322 if (!sbuf) goto error;
10323 srelease = 1;
10324 buf1 = _PyUnicode_AsKind(str1, rkind);
10325 if (!buf1) goto error;
10326 release1 = 1;
10327 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010328 u = PyUnicode_New(slen, maxchar);
10329 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010331 assert(PyUnicode_KIND(u) == rkind);
10332 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010333
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010334 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010335 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010336 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010338 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010340
10341 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010342 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010343 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010344 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010345 if (i == -1)
10346 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010347 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010349 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010351 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010352 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010353 }
10354 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010356 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 int rkind = skind;
10358 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010361 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 buf1 = _PyUnicode_AsKind(str1, rkind);
10363 if (!buf1) goto error;
10364 release1 = 1;
10365 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010366 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010367 if (n == 0)
10368 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010369 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010370 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 buf2 = _PyUnicode_AsKind(str2, rkind);
10372 if (!buf2) goto error;
10373 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010376 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 rkind = kind2;
10378 sbuf = _PyUnicode_AsKind(self, rkind);
10379 if (!sbuf) goto error;
10380 srelease = 1;
10381 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010382 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 buf1 = _PyUnicode_AsKind(str1, rkind);
10384 if (!buf1) goto error;
10385 release1 = 1;
10386 }
10387 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10388 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010389 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 PyErr_SetString(PyExc_OverflowError,
10391 "replace string is too long");
10392 goto error;
10393 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010394 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010395 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010396 _Py_INCREF_UNICODE_EMPTY();
10397 if (!unicode_empty)
10398 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010399 u = unicode_empty;
10400 goto done;
10401 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010402 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403 PyErr_SetString(PyExc_OverflowError,
10404 "replace string is too long");
10405 goto error;
10406 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010407 u = PyUnicode_New(new_size, maxchar);
10408 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010410 assert(PyUnicode_KIND(u) == rkind);
10411 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 ires = i = 0;
10413 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010414 while (n-- > 0) {
10415 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010416 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010417 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010418 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010419 if (j == -1)
10420 break;
10421 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010422 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010423 memcpy(res + rkind * ires,
10424 sbuf + rkind * i,
10425 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010427 }
10428 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010430 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010432 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010434 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010436 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010438 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010439 memcpy(res + rkind * ires,
10440 sbuf + rkind * i,
10441 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010442 }
10443 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010444 /* interleave */
10445 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010446 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010448 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010450 if (--n <= 0)
10451 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010452 memcpy(res + rkind * ires,
10453 sbuf + rkind * i,
10454 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 ires++;
10456 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010457 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010458 memcpy(res + rkind * ires,
10459 sbuf + rkind * i,
10460 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010461 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010462 }
10463
10464 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010465 unicode_adjust_maxchar(&u);
10466 if (u == NULL)
10467 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010468 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010469
10470 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 if (srelease)
10472 PyMem_FREE(sbuf);
10473 if (release1)
10474 PyMem_FREE(buf1);
10475 if (release2)
10476 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010477 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010479
Benjamin Peterson29060642009-01-31 22:14:21 +000010480 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010481 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 if (srelease)
10483 PyMem_FREE(sbuf);
10484 if (release1)
10485 PyMem_FREE(buf1);
10486 if (release2)
10487 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010488 return unicode_result_unchanged(self);
10489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 error:
10491 if (srelease && sbuf)
10492 PyMem_FREE(sbuf);
10493 if (release1 && buf1)
10494 PyMem_FREE(buf1);
10495 if (release2 && buf2)
10496 PyMem_FREE(buf2);
10497 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010498}
10499
10500/* --- Unicode Object Methods --------------------------------------------- */
10501
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010502PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010503 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010504\n\
10505Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010506characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010507
10508static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010509unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010510{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010511 if (PyUnicode_READY(self) == -1)
10512 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010513 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010514}
10515
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010516PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010517 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010518\n\
10519Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010520have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010521
10522static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010523unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010524{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010525 if (PyUnicode_READY(self) == -1)
10526 return NULL;
10527 if (PyUnicode_GET_LENGTH(self) == 0)
10528 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010529 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010530}
10531
Benjamin Petersond5890c82012-01-14 13:23:30 -050010532PyDoc_STRVAR(casefold__doc__,
10533 "S.casefold() -> str\n\
10534\n\
10535Return a version of S suitable for caseless comparisons.");
10536
10537static PyObject *
10538unicode_casefold(PyObject *self)
10539{
10540 if (PyUnicode_READY(self) == -1)
10541 return NULL;
10542 if (PyUnicode_IS_ASCII(self))
10543 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010544 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010545}
10546
10547
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010548/* Argument converter. Coerces to a single unicode character */
10549
10550static int
10551convert_uc(PyObject *obj, void *addr)
10552{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010554 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010555
Benjamin Peterson14339b62009-01-31 16:36:08 +000010556 uniobj = PyUnicode_FromObject(obj);
10557 if (uniobj == NULL) {
10558 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010559 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010560 return 0;
10561 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010562 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010563 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010564 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010565 Py_DECREF(uniobj);
10566 return 0;
10567 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010569 Py_DECREF(uniobj);
10570 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010571}
10572
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010573PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010574 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010575\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010576Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010577done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010578
10579static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010580unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010581{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010582 Py_ssize_t marg, left;
10583 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 Py_UCS4 fillchar = ' ';
10585
Victor Stinnere9a29352011-10-01 02:14:59 +020010586 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010588
Benjamin Petersonbac79492012-01-14 13:34:47 -050010589 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010590 return NULL;
10591
Victor Stinnerc4b49542011-12-11 22:44:26 +010010592 if (PyUnicode_GET_LENGTH(self) >= width)
10593 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010594
Victor Stinnerc4b49542011-12-11 22:44:26 +010010595 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596 left = marg / 2 + (marg & width & 1);
10597
Victor Stinner9310abb2011-10-05 00:59:23 +020010598 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599}
10600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601/* This function assumes that str1 and str2 are readied by the caller. */
10602
Marc-André Lemburge5034372000-08-08 08:04:29 +000010603static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010604unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010605{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010606#define COMPARE(TYPE1, TYPE2) \
10607 do { \
10608 TYPE1* p1 = (TYPE1 *)data1; \
10609 TYPE2* p2 = (TYPE2 *)data2; \
10610 TYPE1* end = p1 + len; \
10611 Py_UCS4 c1, c2; \
10612 for (; p1 != end; p1++, p2++) { \
10613 c1 = *p1; \
10614 c2 = *p2; \
10615 if (c1 != c2) \
10616 return (c1 < c2) ? -1 : 1; \
10617 } \
10618 } \
10619 while (0)
10620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 int kind1, kind2;
10622 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010623 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 kind1 = PyUnicode_KIND(str1);
10626 kind2 = PyUnicode_KIND(str2);
10627 data1 = PyUnicode_DATA(str1);
10628 data2 = PyUnicode_DATA(str2);
10629 len1 = PyUnicode_GET_LENGTH(str1);
10630 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010631 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010632
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010633 switch(kind1) {
10634 case PyUnicode_1BYTE_KIND:
10635 {
10636 switch(kind2) {
10637 case PyUnicode_1BYTE_KIND:
10638 {
10639 int cmp = memcmp(data1, data2, len);
10640 /* normalize result of memcmp() into the range [-1; 1] */
10641 if (cmp < 0)
10642 return -1;
10643 if (cmp > 0)
10644 return 1;
10645 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010646 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010647 case PyUnicode_2BYTE_KIND:
10648 COMPARE(Py_UCS1, Py_UCS2);
10649 break;
10650 case PyUnicode_4BYTE_KIND:
10651 COMPARE(Py_UCS1, Py_UCS4);
10652 break;
10653 default:
10654 assert(0);
10655 }
10656 break;
10657 }
10658 case PyUnicode_2BYTE_KIND:
10659 {
10660 switch(kind2) {
10661 case PyUnicode_1BYTE_KIND:
10662 COMPARE(Py_UCS2, Py_UCS1);
10663 break;
10664 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010665 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010666 COMPARE(Py_UCS2, Py_UCS2);
10667 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010668 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010669 case PyUnicode_4BYTE_KIND:
10670 COMPARE(Py_UCS2, Py_UCS4);
10671 break;
10672 default:
10673 assert(0);
10674 }
10675 break;
10676 }
10677 case PyUnicode_4BYTE_KIND:
10678 {
10679 switch(kind2) {
10680 case PyUnicode_1BYTE_KIND:
10681 COMPARE(Py_UCS4, Py_UCS1);
10682 break;
10683 case PyUnicode_2BYTE_KIND:
10684 COMPARE(Py_UCS4, Py_UCS2);
10685 break;
10686 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010687 {
10688#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10689 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10690 /* normalize result of wmemcmp() into the range [-1; 1] */
10691 if (cmp < 0)
10692 return -1;
10693 if (cmp > 0)
10694 return 1;
10695#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010696 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010697#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010698 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010699 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010700 default:
10701 assert(0);
10702 }
10703 break;
10704 }
10705 default:
10706 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010707 }
10708
Victor Stinner770e19e2012-10-04 22:59:45 +020010709 if (len1 == len2)
10710 return 0;
10711 if (len1 < len2)
10712 return -1;
10713 else
10714 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010715
10716#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010717}
10718
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010719Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010720unicode_compare_eq(PyObject *str1, PyObject *str2)
10721{
10722 int kind;
10723 void *data1, *data2;
10724 Py_ssize_t len;
10725 int cmp;
10726
Victor Stinnere5567ad2012-10-23 02:48:49 +020010727 len = PyUnicode_GET_LENGTH(str1);
10728 if (PyUnicode_GET_LENGTH(str2) != len)
10729 return 0;
10730 kind = PyUnicode_KIND(str1);
10731 if (PyUnicode_KIND(str2) != kind)
10732 return 0;
10733 data1 = PyUnicode_DATA(str1);
10734 data2 = PyUnicode_DATA(str2);
10735
10736 cmp = memcmp(data1, data2, len * kind);
10737 return (cmp == 0);
10738}
10739
10740
Alexander Belopolsky40018472011-02-26 01:02:56 +000010741int
10742PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010744 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10745 if (PyUnicode_READY(left) == -1 ||
10746 PyUnicode_READY(right) == -1)
10747 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010748
10749 /* a string is equal to itself */
10750 if (left == right)
10751 return 0;
10752
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010753 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010754 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010755 PyErr_Format(PyExc_TypeError,
10756 "Can't compare %.100s and %.100s",
10757 left->ob_type->tp_name,
10758 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759 return -1;
10760}
10761
Martin v. Löwis5b222132007-06-10 09:51:05 +000010762int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010763_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10764{
10765 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10766 if (right_str == NULL)
10767 return -1;
10768 return PyUnicode_Compare(left, right_str);
10769}
10770
10771int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010772PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10773{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774 Py_ssize_t i;
10775 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 Py_UCS4 chr;
10777
Victor Stinner910337b2011-10-03 03:20:16 +020010778 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010779 if (PyUnicode_READY(uni) == -1)
10780 return -1;
10781 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010782 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010783 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010784 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010785 size_t len, len2 = strlen(str);
10786 int cmp;
10787
10788 len = Py_MIN(len1, len2);
10789 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010790 if (cmp != 0) {
10791 if (cmp < 0)
10792 return -1;
10793 else
10794 return 1;
10795 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010796 if (len1 > len2)
10797 return 1; /* uni is longer */
10798 if (len2 > len1)
10799 return -1; /* str is longer */
10800 return 0;
10801 }
10802 else {
10803 void *data = PyUnicode_DATA(uni);
10804 /* Compare Unicode string and source character set string */
10805 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10806 if (chr != str[i])
10807 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10808 /* This check keeps Python strings that end in '\0' from comparing equal
10809 to C strings identical up to that point. */
10810 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10811 return 1; /* uni is longer */
10812 if (str[i])
10813 return -1; /* str is longer */
10814 return 0;
10815 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010816}
10817
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010818
Benjamin Peterson29060642009-01-31 22:14:21 +000010819#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010820 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010821
Alexander Belopolsky40018472011-02-26 01:02:56 +000010822PyObject *
10823PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010824{
10825 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010826 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010827
Victor Stinnere5567ad2012-10-23 02:48:49 +020010828 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10829 Py_RETURN_NOTIMPLEMENTED;
10830
10831 if (PyUnicode_READY(left) == -1 ||
10832 PyUnicode_READY(right) == -1)
10833 return NULL;
10834
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010835 if (left == right) {
10836 switch (op) {
10837 case Py_EQ:
10838 case Py_LE:
10839 case Py_GE:
10840 /* a string is equal to itself */
10841 v = Py_True;
10842 break;
10843 case Py_NE:
10844 case Py_LT:
10845 case Py_GT:
10846 v = Py_False;
10847 break;
10848 default:
10849 PyErr_BadArgument();
10850 return NULL;
10851 }
10852 }
10853 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010854 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010855 result ^= (op == Py_NE);
10856 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010857 }
10858 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010859 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010860
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010861 /* Convert the return value to a Boolean */
10862 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010863 case Py_LE:
10864 v = TEST_COND(result <= 0);
10865 break;
10866 case Py_GE:
10867 v = TEST_COND(result >= 0);
10868 break;
10869 case Py_LT:
10870 v = TEST_COND(result == -1);
10871 break;
10872 case Py_GT:
10873 v = TEST_COND(result == 1);
10874 break;
10875 default:
10876 PyErr_BadArgument();
10877 return NULL;
10878 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010879 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010880 Py_INCREF(v);
10881 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010882}
10883
Alexander Belopolsky40018472011-02-26 01:02:56 +000010884int
10885PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010886{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010887 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010888 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010889 void *buf1, *buf2;
10890 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010891 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010892
10893 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010894 sub = PyUnicode_FromObject(element);
10895 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010896 PyErr_Format(PyExc_TypeError,
10897 "'in <string>' requires string as left operand, not %s",
10898 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010899 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010900 }
10901
Thomas Wouters477c8d52006-05-27 19:21:47 +000010902 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010903 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010904 Py_DECREF(sub);
10905 return -1;
10906 }
10907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010908 kind1 = PyUnicode_KIND(str);
10909 kind2 = PyUnicode_KIND(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010910 buf1 = PyUnicode_DATA(str);
10911 buf2 = PyUnicode_DATA(sub);
Victor Stinner77282cb2013-04-14 19:22:47 +020010912 if (kind2 != kind1) {
10913 if (kind2 > kind1) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010914 Py_DECREF(sub);
10915 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010916 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010917 }
Victor Stinner77282cb2013-04-14 19:22:47 +020010918 buf2 = _PyUnicode_AsKind(sub, kind1);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010919 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010920 if (!buf2) {
10921 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010922 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010923 return -1;
10924 }
10925 len1 = PyUnicode_GET_LENGTH(str);
10926 len2 = PyUnicode_GET_LENGTH(sub);
10927
Victor Stinner77282cb2013-04-14 19:22:47 +020010928 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010929 case PyUnicode_1BYTE_KIND:
10930 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10931 break;
10932 case PyUnicode_2BYTE_KIND:
10933 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10934 break;
10935 case PyUnicode_4BYTE_KIND:
10936 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10937 break;
10938 default:
10939 result = -1;
10940 assert(0);
10941 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010942
10943 Py_DECREF(str);
10944 Py_DECREF(sub);
10945
Victor Stinner77282cb2013-04-14 19:22:47 +020010946 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947 PyMem_Free(buf2);
10948
Guido van Rossum403d68b2000-03-13 15:55:09 +000010949 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010950}
10951
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952/* Concat to string or Unicode object giving a new Unicode object. */
10953
Alexander Belopolsky40018472011-02-26 01:02:56 +000010954PyObject *
10955PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010958 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010959 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960
10961 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010962 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010964 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010965 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010967 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968
10969 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010970 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010971 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010972 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010974 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010975 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977 }
10978
Victor Stinner488fa492011-12-12 00:01:39 +010010979 u_len = PyUnicode_GET_LENGTH(u);
10980 v_len = PyUnicode_GET_LENGTH(v);
10981 if (u_len > PY_SSIZE_T_MAX - v_len) {
10982 PyErr_SetString(PyExc_OverflowError,
10983 "strings are too large to concat");
10984 goto onError;
10985 }
10986 new_len = u_len + v_len;
10987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010989 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010990 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010993 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010995 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010996 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10997 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998 Py_DECREF(u);
10999 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011000 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002
Benjamin Peterson29060642009-01-31 22:14:21 +000011003 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004 Py_XDECREF(u);
11005 Py_XDECREF(v);
11006 return NULL;
11007}
11008
Walter Dörwald1ab83302007-05-18 17:15:44 +000011009void
Victor Stinner23e56682011-10-03 03:54:37 +020011010PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011011{
Victor Stinner23e56682011-10-03 03:54:37 +020011012 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011013 Py_UCS4 maxchar, maxchar2;
11014 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011015
11016 if (p_left == NULL) {
11017 if (!PyErr_Occurred())
11018 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011019 return;
11020 }
Victor Stinner23e56682011-10-03 03:54:37 +020011021 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011022 if (right == NULL || left == NULL
11023 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011024 if (!PyErr_Occurred())
11025 PyErr_BadInternalCall();
11026 goto error;
11027 }
11028
Benjamin Petersonbac79492012-01-14 13:34:47 -050011029 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011030 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011031 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011032 goto error;
11033
Victor Stinner488fa492011-12-12 00:01:39 +010011034 /* Shortcuts */
11035 if (left == unicode_empty) {
11036 Py_DECREF(left);
11037 Py_INCREF(right);
11038 *p_left = right;
11039 return;
11040 }
11041 if (right == unicode_empty)
11042 return;
11043
11044 left_len = PyUnicode_GET_LENGTH(left);
11045 right_len = PyUnicode_GET_LENGTH(right);
11046 if (left_len > PY_SSIZE_T_MAX - right_len) {
11047 PyErr_SetString(PyExc_OverflowError,
11048 "strings are too large to concat");
11049 goto error;
11050 }
11051 new_len = left_len + right_len;
11052
11053 if (unicode_modifiable(left)
11054 && PyUnicode_CheckExact(right)
11055 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011056 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11057 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011058 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011059 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011060 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11061 {
11062 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011063 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011064 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011065
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011066 /* copy 'right' into the newly allocated area of 'left' */
11067 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011068 }
Victor Stinner488fa492011-12-12 00:01:39 +010011069 else {
11070 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11071 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011072 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011073
Victor Stinner488fa492011-12-12 00:01:39 +010011074 /* Concat the two Unicode strings */
11075 res = PyUnicode_New(new_len, maxchar);
11076 if (res == NULL)
11077 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011078 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11079 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011080 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011081 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011082 }
11083 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011084 return;
11085
11086error:
Victor Stinner488fa492011-12-12 00:01:39 +010011087 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011088}
11089
11090void
11091PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11092{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011093 PyUnicode_Append(pleft, right);
11094 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011095}
11096
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011097PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011098 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011099\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011100Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011101string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011102interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011103
11104static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011105unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011106{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011107 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011108 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011109 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011111 int kind1, kind2, kind;
11112 void *buf1, *buf2;
11113 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011114
Jesus Ceaac451502011-04-20 17:09:23 +020011115 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11116 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011117 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011119 kind1 = PyUnicode_KIND(self);
11120 kind2 = PyUnicode_KIND(substring);
Christian Heimesd47802e2013-06-29 21:33:36 +020011121 if (kind2 > kind1) {
11122 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011123 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011124 }
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011125 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011126 buf1 = PyUnicode_DATA(self);
11127 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011128 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011129 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011130 if (!buf2) {
11131 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011132 return NULL;
11133 }
11134 len1 = PyUnicode_GET_LENGTH(self);
11135 len2 = PyUnicode_GET_LENGTH(substring);
11136
11137 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011138 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011139 case PyUnicode_1BYTE_KIND:
11140 iresult = ucs1lib_count(
11141 ((Py_UCS1*)buf1) + start, end - start,
11142 buf2, len2, PY_SSIZE_T_MAX
11143 );
11144 break;
11145 case PyUnicode_2BYTE_KIND:
11146 iresult = ucs2lib_count(
11147 ((Py_UCS2*)buf1) + start, end - start,
11148 buf2, len2, PY_SSIZE_T_MAX
11149 );
11150 break;
11151 case PyUnicode_4BYTE_KIND:
11152 iresult = ucs4lib_count(
11153 ((Py_UCS4*)buf1) + start, end - start,
11154 buf2, len2, PY_SSIZE_T_MAX
11155 );
11156 break;
11157 default:
11158 assert(0); iresult = 0;
11159 }
11160
11161 result = PyLong_FromSsize_t(iresult);
11162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011163 if (kind2 != kind)
11164 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011165
11166 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011167
Guido van Rossumd57fd912000-03-10 22:53:23 +000011168 return result;
11169}
11170
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011171PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011172 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011174Encode S using the codec registered for encoding. Default encoding\n\
11175is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011176handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011177a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11178'xmlcharrefreplace' as well as any other name registered with\n\
11179codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180
11181static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011182unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011184 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185 char *encoding = NULL;
11186 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011187
Benjamin Peterson308d6372009-09-18 21:42:35 +000011188 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11189 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011191 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011192}
11193
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011194PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011195 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196\n\
11197Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011198If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199
11200static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011201unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011203 Py_ssize_t i, j, line_pos, src_len, incr;
11204 Py_UCS4 ch;
11205 PyObject *u;
11206 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011207 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011208 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011209 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011210 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211
Ezio Melotti745d54d2013-11-16 19:10:57 +020011212 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11213 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011214 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215
Antoine Pitrou22425222011-10-04 19:10:51 +020011216 if (PyUnicode_READY(self) == -1)
11217 return NULL;
11218
Thomas Wouters7e474022000-07-16 12:04:32 +000011219 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011220 src_len = PyUnicode_GET_LENGTH(self);
11221 i = j = line_pos = 0;
11222 kind = PyUnicode_KIND(self);
11223 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011224 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011225 for (; i < src_len; i++) {
11226 ch = PyUnicode_READ(kind, src_data, i);
11227 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011228 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011229 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011230 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011231 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011232 goto overflow;
11233 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011234 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011235 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011236 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011238 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011239 goto overflow;
11240 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011242 if (ch == '\n' || ch == '\r')
11243 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011245 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011246 if (!found)
11247 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011248
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011250 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251 if (!u)
11252 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011253 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254
Antoine Pitroue71d5742011-10-04 15:55:09 +020011255 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256
Antoine Pitroue71d5742011-10-04 15:55:09 +020011257 for (; i < src_len; i++) {
11258 ch = PyUnicode_READ(kind, src_data, i);
11259 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011260 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011261 incr = tabsize - (line_pos % tabsize);
11262 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011263 FILL(kind, dest_data, ' ', j, incr);
11264 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011265 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011266 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011267 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011268 line_pos++;
11269 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011270 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011271 if (ch == '\n' || ch == '\r')
11272 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011274 }
11275 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011276 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011277
Antoine Pitroue71d5742011-10-04 15:55:09 +020011278 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011279 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11280 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281}
11282
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011283PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011284 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285\n\
11286Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011287such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011288arguments start and end are interpreted as in slice notation.\n\
11289\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011290Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291
11292static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011293unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011295 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011296 Py_ssize_t start;
11297 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011298 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011299
Jesus Ceaac451502011-04-20 17:09:23 +020011300 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11301 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303
Christian Heimesd47802e2013-06-29 21:33:36 +020011304 if (PyUnicode_READY(self) == -1) {
11305 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011307 }
11308 if (PyUnicode_READY(substring) == -1) {
11309 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011310 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011311 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011312
Victor Stinner7931d9a2011-11-04 00:22:48 +010011313 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314
11315 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011317 if (result == -2)
11318 return NULL;
11319
Christian Heimes217cfd12007-12-02 14:31:20 +000011320 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321}
11322
11323static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011324unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011326 void *data;
11327 enum PyUnicode_Kind kind;
11328 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011329
11330 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11331 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011332 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011333 }
11334 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11335 PyErr_SetString(PyExc_IndexError, "string index out of range");
11336 return NULL;
11337 }
11338 kind = PyUnicode_KIND(self);
11339 data = PyUnicode_DATA(self);
11340 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011341 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342}
11343
Guido van Rossumc2504932007-09-18 19:42:40 +000011344/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011345 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011346static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011347unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348{
Guido van Rossumc2504932007-09-18 19:42:40 +000011349 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011350 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011351
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011352#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011353 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011354#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011355 if (_PyUnicode_HASH(self) != -1)
11356 return _PyUnicode_HASH(self);
11357 if (PyUnicode_READY(self) == -1)
11358 return -1;
11359 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011360 /*
11361 We make the hash of the empty string be 0, rather than using
11362 (prefix ^ suffix), since this slightly obfuscates the hash secret
11363 */
11364 if (len == 0) {
11365 _PyUnicode_HASH(self) = 0;
11366 return 0;
11367 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011368 x = _Py_HashBytes(PyUnicode_DATA(self),
11369 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011370 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011371 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372}
11373
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011374PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011375 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011377Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011378
11379static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011380unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011381{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011382 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011383 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011384 Py_ssize_t start;
11385 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386
Jesus Ceaac451502011-04-20 17:09:23 +020011387 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11388 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390
Christian Heimesd47a0452013-06-29 21:21:37 +020011391 if (PyUnicode_READY(self) == -1) {
11392 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011393 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011394 }
11395 if (PyUnicode_READY(substring) == -1) {
11396 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011398 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011399
Victor Stinner7931d9a2011-11-04 00:22:48 +010011400 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401
11402 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011404 if (result == -2)
11405 return NULL;
11406
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407 if (result < 0) {
11408 PyErr_SetString(PyExc_ValueError, "substring not found");
11409 return NULL;
11410 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011411
Christian Heimes217cfd12007-12-02 14:31:20 +000011412 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413}
11414
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011415PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011416 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011418Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011419at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420
11421static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011422unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424 Py_ssize_t i, length;
11425 int kind;
11426 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427 int cased;
11428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011429 if (PyUnicode_READY(self) == -1)
11430 return NULL;
11431 length = PyUnicode_GET_LENGTH(self);
11432 kind = PyUnicode_KIND(self);
11433 data = PyUnicode_DATA(self);
11434
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011436 if (length == 1)
11437 return PyBool_FromLong(
11438 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011440 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011441 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011442 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011443
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 for (i = 0; i < length; i++) {
11446 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011447
Benjamin Peterson29060642009-01-31 22:14:21 +000011448 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11449 return PyBool_FromLong(0);
11450 else if (!cased && Py_UNICODE_ISLOWER(ch))
11451 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011453 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454}
11455
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011456PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011457 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011459Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011460at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461
11462static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011463unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011465 Py_ssize_t i, length;
11466 int kind;
11467 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468 int cased;
11469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011470 if (PyUnicode_READY(self) == -1)
11471 return NULL;
11472 length = PyUnicode_GET_LENGTH(self);
11473 kind = PyUnicode_KIND(self);
11474 data = PyUnicode_DATA(self);
11475
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011477 if (length == 1)
11478 return PyBool_FromLong(
11479 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011481 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011482 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011483 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011484
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011486 for (i = 0; i < length; i++) {
11487 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011488
Benjamin Peterson29060642009-01-31 22:14:21 +000011489 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11490 return PyBool_FromLong(0);
11491 else if (!cased && Py_UNICODE_ISUPPER(ch))
11492 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011494 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495}
11496
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011497PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011498 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011500Return True if S is a titlecased string and there is at least one\n\
11501character in S, i.e. upper- and titlecase characters may only\n\
11502follow uncased characters and lowercase characters only cased ones.\n\
11503Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504
11505static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011506unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011508 Py_ssize_t i, length;
11509 int kind;
11510 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511 int cased, previous_is_cased;
11512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 if (PyUnicode_READY(self) == -1)
11514 return NULL;
11515 length = PyUnicode_GET_LENGTH(self);
11516 kind = PyUnicode_KIND(self);
11517 data = PyUnicode_DATA(self);
11518
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011520 if (length == 1) {
11521 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11522 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11523 (Py_UNICODE_ISUPPER(ch) != 0));
11524 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011525
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011526 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011527 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011528 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011529
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530 cased = 0;
11531 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011532 for (i = 0; i < length; i++) {
11533 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011534
Benjamin Peterson29060642009-01-31 22:14:21 +000011535 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11536 if (previous_is_cased)
11537 return PyBool_FromLong(0);
11538 previous_is_cased = 1;
11539 cased = 1;
11540 }
11541 else if (Py_UNICODE_ISLOWER(ch)) {
11542 if (!previous_is_cased)
11543 return PyBool_FromLong(0);
11544 previous_is_cased = 1;
11545 cased = 1;
11546 }
11547 else
11548 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011549 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011550 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551}
11552
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011553PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011554 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011556Return True if all characters in S are whitespace\n\
11557and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558
11559static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011560unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011562 Py_ssize_t i, length;
11563 int kind;
11564 void *data;
11565
11566 if (PyUnicode_READY(self) == -1)
11567 return NULL;
11568 length = PyUnicode_GET_LENGTH(self);
11569 kind = PyUnicode_KIND(self);
11570 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011573 if (length == 1)
11574 return PyBool_FromLong(
11575 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011577 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011578 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011579 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011581 for (i = 0; i < length; i++) {
11582 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011583 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011584 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011586 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587}
11588
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011589PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011590 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011591\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011592Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011593and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011594
11595static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011596unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011597{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011598 Py_ssize_t i, length;
11599 int kind;
11600 void *data;
11601
11602 if (PyUnicode_READY(self) == -1)
11603 return NULL;
11604 length = PyUnicode_GET_LENGTH(self);
11605 kind = PyUnicode_KIND(self);
11606 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011607
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011608 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609 if (length == 1)
11610 return PyBool_FromLong(
11611 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011612
11613 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011615 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011617 for (i = 0; i < length; i++) {
11618 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011619 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011620 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011621 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011622}
11623
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011624PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011625 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011626\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011627Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011628and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011629
11630static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011631unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011632{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011633 int kind;
11634 void *data;
11635 Py_ssize_t len, i;
11636
11637 if (PyUnicode_READY(self) == -1)
11638 return NULL;
11639
11640 kind = PyUnicode_KIND(self);
11641 data = PyUnicode_DATA(self);
11642 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011643
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011644 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011645 if (len == 1) {
11646 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11647 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11648 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011649
11650 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011651 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011652 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011654 for (i = 0; i < len; i++) {
11655 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011656 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011657 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011658 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011659 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011660}
11661
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011662PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011663 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011665Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011666False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011667
11668static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011669unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011671 Py_ssize_t i, length;
11672 int kind;
11673 void *data;
11674
11675 if (PyUnicode_READY(self) == -1)
11676 return NULL;
11677 length = PyUnicode_GET_LENGTH(self);
11678 kind = PyUnicode_KIND(self);
11679 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682 if (length == 1)
11683 return PyBool_FromLong(
11684 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011686 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011687 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011688 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690 for (i = 0; i < length; i++) {
11691 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011692 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011694 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695}
11696
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011697PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011698 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011699\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011700Return True if all characters in S are digits\n\
11701and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702
11703static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011704unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011706 Py_ssize_t i, length;
11707 int kind;
11708 void *data;
11709
11710 if (PyUnicode_READY(self) == -1)
11711 return NULL;
11712 length = PyUnicode_GET_LENGTH(self);
11713 kind = PyUnicode_KIND(self);
11714 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011717 if (length == 1) {
11718 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11719 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11720 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011722 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011723 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011724 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011726 for (i = 0; i < length; i++) {
11727 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011728 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011730 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731}
11732
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011733PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011734 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011735\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011736Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011737False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738
11739static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011740unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011742 Py_ssize_t i, length;
11743 int kind;
11744 void *data;
11745
11746 if (PyUnicode_READY(self) == -1)
11747 return NULL;
11748 length = PyUnicode_GET_LENGTH(self);
11749 kind = PyUnicode_KIND(self);
11750 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 if (length == 1)
11754 return PyBool_FromLong(
11755 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011757 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011759 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011760
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011761 for (i = 0; i < length; i++) {
11762 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011763 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011765 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766}
11767
Martin v. Löwis47383402007-08-15 07:32:56 +000011768int
11769PyUnicode_IsIdentifier(PyObject *self)
11770{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011771 int kind;
11772 void *data;
11773 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011774 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011776 if (PyUnicode_READY(self) == -1) {
11777 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011778 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011779 }
11780
11781 /* Special case for empty strings */
11782 if (PyUnicode_GET_LENGTH(self) == 0)
11783 return 0;
11784 kind = PyUnicode_KIND(self);
11785 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011786
11787 /* PEP 3131 says that the first character must be in
11788 XID_Start and subsequent characters in XID_Continue,
11789 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011790 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011791 letters, digits, underscore). However, given the current
11792 definition of XID_Start and XID_Continue, it is sufficient
11793 to check just for these, except that _ must be allowed
11794 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011795 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011796 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011797 return 0;
11798
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011799 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011801 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011802 return 1;
11803}
11804
11805PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011806 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011807\n\
11808Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011809to the language definition.\n\
11810\n\
11811Use keyword.iskeyword() to test for reserved identifiers\n\
11812such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011813
11814static PyObject*
11815unicode_isidentifier(PyObject *self)
11816{
11817 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11818}
11819
Georg Brandl559e5d72008-06-11 18:37:52 +000011820PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011821 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011822\n\
11823Return True if all characters in S are considered\n\
11824printable in repr() or S is empty, False otherwise.");
11825
11826static PyObject*
11827unicode_isprintable(PyObject *self)
11828{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011829 Py_ssize_t i, length;
11830 int kind;
11831 void *data;
11832
11833 if (PyUnicode_READY(self) == -1)
11834 return NULL;
11835 length = PyUnicode_GET_LENGTH(self);
11836 kind = PyUnicode_KIND(self);
11837 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011838
11839 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 if (length == 1)
11841 return PyBool_FromLong(
11842 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011843
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011844 for (i = 0; i < length; i++) {
11845 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011846 Py_RETURN_FALSE;
11847 }
11848 }
11849 Py_RETURN_TRUE;
11850}
11851
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011852PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011853 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854\n\
11855Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011856iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857
11858static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011859unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011861 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011862}
11863
Martin v. Löwis18e16552006-02-15 17:27:45 +000011864static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011865unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011867 if (PyUnicode_READY(self) == -1)
11868 return -1;
11869 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870}
11871
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011872PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011873 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011875Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011876done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011877
11878static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011879unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011881 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882 Py_UCS4 fillchar = ' ';
11883
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011884 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885 return NULL;
11886
Benjamin Petersonbac79492012-01-14 13:34:47 -050011887 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011888 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889
Victor Stinnerc4b49542011-12-11 22:44:26 +010011890 if (PyUnicode_GET_LENGTH(self) >= width)
11891 return unicode_result_unchanged(self);
11892
11893 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894}
11895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011896PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011897 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011899Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900
11901static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011902unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011904 if (PyUnicode_READY(self) == -1)
11905 return NULL;
11906 if (PyUnicode_IS_ASCII(self))
11907 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011908 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909}
11910
/* Strip modes; they double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip mode above.
   The table is read-only, so both the strings and the pointers are const. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode `i`: the format string with its leading
   "|O:" (3 characters) skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11919
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011920/* externally visible for str.strip(unicode) */
11921PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011922_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011923{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011924 void *data;
11925 int kind;
11926 Py_ssize_t i, j, len;
11927 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011928 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11931 return NULL;
11932
11933 kind = PyUnicode_KIND(self);
11934 data = PyUnicode_DATA(self);
11935 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011936 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011937 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11938 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011939 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011940
Benjamin Peterson14339b62009-01-31 16:36:08 +000011941 i = 0;
11942 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011943 while (i < len) {
11944 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11945 if (!BLOOM(sepmask, ch))
11946 break;
11947 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11948 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011949 i++;
11950 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011951 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011952
Benjamin Peterson14339b62009-01-31 16:36:08 +000011953 j = len;
11954 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011955 j--;
11956 while (j >= i) {
11957 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11958 if (!BLOOM(sepmask, ch))
11959 break;
11960 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11961 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011962 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011963 }
11964
Benjamin Peterson29060642009-01-31 22:14:21 +000011965 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011966 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011967
Victor Stinner7931d9a2011-11-04 00:22:48 +010011968 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969}
11970
11971PyObject*
11972PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11973{
11974 unsigned char *data;
11975 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011976 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011977
Victor Stinnerde636f32011-10-01 03:55:54 +020011978 if (PyUnicode_READY(self) == -1)
11979 return NULL;
11980
Victor Stinner684d5fd2012-05-03 02:32:34 +020011981 length = PyUnicode_GET_LENGTH(self);
11982 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011983
Victor Stinner684d5fd2012-05-03 02:32:34 +020011984 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011985 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986
Victor Stinnerde636f32011-10-01 03:55:54 +020011987 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011988 PyErr_SetString(PyExc_IndexError, "string index out of range");
11989 return NULL;
11990 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011991 if (start >= length || end < start)
11992 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011993
Victor Stinner684d5fd2012-05-03 02:32:34 +020011994 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011995 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011996 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011997 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011998 }
11999 else {
12000 kind = PyUnicode_KIND(self);
12001 data = PyUnicode_1BYTE_DATA(self);
12002 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012003 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012004 length);
12005 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007
12008static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012009do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012010{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 Py_ssize_t len, i, j;
12012
12013 if (PyUnicode_READY(self) == -1)
12014 return NULL;
12015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012017
Victor Stinnercc7af722013-04-09 22:39:24 +020012018 if (PyUnicode_IS_ASCII(self)) {
12019 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12020
12021 i = 0;
12022 if (striptype != RIGHTSTRIP) {
12023 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012024 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012025 if (!_Py_ascii_whitespace[ch])
12026 break;
12027 i++;
12028 }
12029 }
12030
12031 j = len;
12032 if (striptype != LEFTSTRIP) {
12033 j--;
12034 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012035 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012036 if (!_Py_ascii_whitespace[ch])
12037 break;
12038 j--;
12039 }
12040 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012041 }
12042 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012043 else {
12044 int kind = PyUnicode_KIND(self);
12045 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012046
Victor Stinnercc7af722013-04-09 22:39:24 +020012047 i = 0;
12048 if (striptype != RIGHTSTRIP) {
12049 while (i < len) {
12050 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12051 if (!Py_UNICODE_ISSPACE(ch))
12052 break;
12053 i++;
12054 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012055 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012056
12057 j = len;
12058 if (striptype != LEFTSTRIP) {
12059 j--;
12060 while (j >= i) {
12061 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12062 if (!Py_UNICODE_ISSPACE(ch))
12063 break;
12064 j--;
12065 }
12066 j++;
12067 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012068 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012069
Victor Stinner7931d9a2011-11-04 00:22:48 +010012070 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071}
12072
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012073
12074static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012075do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012076{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012077 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012078
Serhiy Storchakac6792272013-10-19 21:03:34 +030012079 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012080 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012081
Benjamin Peterson14339b62009-01-31 16:36:08 +000012082 if (sep != NULL && sep != Py_None) {
12083 if (PyUnicode_Check(sep))
12084 return _PyUnicode_XStrip(self, striptype, sep);
12085 else {
12086 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012087 "%s arg must be None or str",
12088 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012089 return NULL;
12090 }
12091 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012092
Benjamin Peterson14339b62009-01-31 16:36:08 +000012093 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012094}
12095
12096
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012097PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012098 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012099\n\
12100Return a copy of the string S with leading and trailing\n\
12101whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012102If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012103
12104static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012105unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012106{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012107 if (PyTuple_GET_SIZE(args) == 0)
12108 return do_strip(self, BOTHSTRIP); /* Common case */
12109 else
12110 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012111}
12112
12113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012114PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012115 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012116\n\
12117Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012118If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012119
12120static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012121unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012122{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012123 if (PyTuple_GET_SIZE(args) == 0)
12124 return do_strip(self, LEFTSTRIP); /* Common case */
12125 else
12126 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012127}
12128
12129
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012130PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012131 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012132\n\
12133Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012134If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012135
12136static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012137unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012138{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012139 if (PyTuple_GET_SIZE(args) == 0)
12140 return do_strip(self, RIGHTSTRIP); /* Common case */
12141 else
12142 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012143}
12144
12145
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012147unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012149 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151
Serhiy Storchaka05997252013-01-26 12:14:02 +020012152 if (len < 1)
12153 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154
Victor Stinnerc4b49542011-12-11 22:44:26 +010012155 /* no repeat, return original string */
12156 if (len == 1)
12157 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012158
Benjamin Petersonbac79492012-01-14 13:34:47 -050012159 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012160 return NULL;
12161
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012162 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012163 PyErr_SetString(PyExc_OverflowError,
12164 "repeated string is too long");
12165 return NULL;
12166 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012167 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012168
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012169 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170 if (!u)
12171 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012172 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012174 if (PyUnicode_GET_LENGTH(str) == 1) {
12175 const int kind = PyUnicode_KIND(str);
12176 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012177 if (kind == PyUnicode_1BYTE_KIND) {
12178 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012179 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012180 }
12181 else if (kind == PyUnicode_2BYTE_KIND) {
12182 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012183 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012184 ucs2[n] = fill_char;
12185 } else {
12186 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12187 assert(kind == PyUnicode_4BYTE_KIND);
12188 for (n = 0; n < len; ++n)
12189 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012190 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191 }
12192 else {
12193 /* number of characters copied this far */
12194 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012195 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012196 char *to = (char *) PyUnicode_DATA(u);
12197 Py_MEMCPY(to, PyUnicode_DATA(str),
12198 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012199 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012200 n = (done <= nchars-done) ? done : nchars-done;
12201 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012202 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012203 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012204 }
12205
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012206 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012207 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208}
12209
Alexander Belopolsky40018472011-02-26 01:02:56 +000012210PyObject *
12211PyUnicode_Replace(PyObject *obj,
12212 PyObject *subobj,
12213 PyObject *replobj,
12214 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215{
12216 PyObject *self;
12217 PyObject *str1;
12218 PyObject *str2;
12219 PyObject *result;
12220
12221 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012222 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012225 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012226 Py_DECREF(self);
12227 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012228 }
12229 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012230 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012231 Py_DECREF(self);
12232 Py_DECREF(str1);
12233 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012235 if (PyUnicode_READY(self) == -1 ||
12236 PyUnicode_READY(str1) == -1 ||
12237 PyUnicode_READY(str2) == -1)
12238 result = NULL;
12239 else
12240 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241 Py_DECREF(self);
12242 Py_DECREF(str1);
12243 Py_DECREF(str2);
12244 return result;
12245}
12246
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012247PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012248 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012249\n\
12250Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012251old replaced by new. If the optional argument count is\n\
12252given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012253
12254static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012255unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012257 PyObject *str1;
12258 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012259 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012260 PyObject *result;
12261
Martin v. Löwis18e16552006-02-15 17:27:45 +000012262 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012264 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012265 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012266 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012267 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012268 return NULL;
12269 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012270 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012271 Py_DECREF(str1);
12272 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012273 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012274 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12275 result = NULL;
12276 else
12277 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278
12279 Py_DECREF(str1);
12280 Py_DECREF(str2);
12281 return result;
12282}
12283
Alexander Belopolsky40018472011-02-26 01:02:56 +000012284static PyObject *
12285unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012286{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012287 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012288 Py_ssize_t isize;
12289 Py_ssize_t osize, squote, dquote, i, o;
12290 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012291 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012292 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012294 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012295 return NULL;
12296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 isize = PyUnicode_GET_LENGTH(unicode);
12298 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300 /* Compute length of output, quote characters, and
12301 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012302 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303 max = 127;
12304 squote = dquote = 0;
12305 ikind = PyUnicode_KIND(unicode);
12306 for (i = 0; i < isize; i++) {
12307 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12308 switch (ch) {
12309 case '\'': squote++; osize++; break;
12310 case '"': dquote++; osize++; break;
12311 case '\\': case '\t': case '\r': case '\n':
12312 osize += 2; break;
12313 default:
12314 /* Fast-path ASCII */
12315 if (ch < ' ' || ch == 0x7f)
12316 osize += 4; /* \xHH */
12317 else if (ch < 0x7f)
12318 osize++;
12319 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12320 osize++;
12321 max = ch > max ? ch : max;
12322 }
12323 else if (ch < 0x100)
12324 osize += 4; /* \xHH */
12325 else if (ch < 0x10000)
12326 osize += 6; /* \uHHHH */
12327 else
12328 osize += 10; /* \uHHHHHHHH */
12329 }
12330 }
12331
12332 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012333 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012334 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012335 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012336 if (dquote)
12337 /* Both squote and dquote present. Use squote,
12338 and escape them */
12339 osize += squote;
12340 else
12341 quote = '"';
12342 }
Victor Stinner55c08782013-04-14 18:45:39 +020012343 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344
12345 repr = PyUnicode_New(osize, max);
12346 if (repr == NULL)
12347 return NULL;
12348 okind = PyUnicode_KIND(repr);
12349 odata = PyUnicode_DATA(repr);
12350
12351 PyUnicode_WRITE(okind, odata, 0, quote);
12352 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012353 if (unchanged) {
12354 _PyUnicode_FastCopyCharacters(repr, 1,
12355 unicode, 0,
12356 isize);
12357 }
12358 else {
12359 for (i = 0, o = 1; i < isize; i++) {
12360 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012361
Victor Stinner55c08782013-04-14 18:45:39 +020012362 /* Escape quotes and backslashes */
12363 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012364 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012365 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012366 continue;
12367 }
12368
12369 /* Map special whitespace to '\t', \n', '\r' */
12370 if (ch == '\t') {
12371 PyUnicode_WRITE(okind, odata, o++, '\\');
12372 PyUnicode_WRITE(okind, odata, o++, 't');
12373 }
12374 else if (ch == '\n') {
12375 PyUnicode_WRITE(okind, odata, o++, '\\');
12376 PyUnicode_WRITE(okind, odata, o++, 'n');
12377 }
12378 else if (ch == '\r') {
12379 PyUnicode_WRITE(okind, odata, o++, '\\');
12380 PyUnicode_WRITE(okind, odata, o++, 'r');
12381 }
12382
12383 /* Map non-printable US ASCII to '\xhh' */
12384 else if (ch < ' ' || ch == 0x7F) {
12385 PyUnicode_WRITE(okind, odata, o++, '\\');
12386 PyUnicode_WRITE(okind, odata, o++, 'x');
12387 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12388 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12389 }
12390
12391 /* Copy ASCII characters as-is */
12392 else if (ch < 0x7F) {
12393 PyUnicode_WRITE(okind, odata, o++, ch);
12394 }
12395
12396 /* Non-ASCII characters */
12397 else {
12398 /* Map Unicode whitespace and control characters
12399 (categories Z* and C* except ASCII space)
12400 */
12401 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12402 PyUnicode_WRITE(okind, odata, o++, '\\');
12403 /* Map 8-bit characters to '\xhh' */
12404 if (ch <= 0xff) {
12405 PyUnicode_WRITE(okind, odata, o++, 'x');
12406 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12407 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12408 }
12409 /* Map 16-bit characters to '\uxxxx' */
12410 else if (ch <= 0xffff) {
12411 PyUnicode_WRITE(okind, odata, o++, 'u');
12412 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12413 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12414 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12415 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12416 }
12417 /* Map 21-bit characters to '\U00xxxxxx' */
12418 else {
12419 PyUnicode_WRITE(okind, odata, o++, 'U');
12420 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12421 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12422 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12423 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12424 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12425 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12426 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12427 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12428 }
12429 }
12430 /* Copy characters as-is */
12431 else {
12432 PyUnicode_WRITE(okind, odata, o++, ch);
12433 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012434 }
12435 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012436 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012437 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012438 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012439 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012440}
12441
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012442PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012443 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012444\n\
12445Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012446such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012447arguments start and end are interpreted as in slice notation.\n\
12448\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012449Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012450
12451static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012452unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012453{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012454 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012455 Py_ssize_t start;
12456 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012457 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012458
Jesus Ceaac451502011-04-20 17:09:23 +020012459 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12460 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012461 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012462
Christian Heimesea71a522013-06-29 21:17:34 +020012463 if (PyUnicode_READY(self) == -1) {
12464 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012465 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012466 }
12467 if (PyUnicode_READY(substring) == -1) {
12468 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012469 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012470 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012471
Victor Stinner7931d9a2011-11-04 00:22:48 +010012472 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012473
12474 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012476 if (result == -2)
12477 return NULL;
12478
Christian Heimes217cfd12007-12-02 14:31:20 +000012479 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012480}
12481
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012482PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012483 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012484\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012485Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486
12487static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012488unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012489{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012490 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012491 Py_ssize_t start;
12492 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012493 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012494
Jesus Ceaac451502011-04-20 17:09:23 +020012495 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12496 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012497 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012498
Christian Heimesea71a522013-06-29 21:17:34 +020012499 if (PyUnicode_READY(self) == -1) {
12500 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012501 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012502 }
12503 if (PyUnicode_READY(substring) == -1) {
12504 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012506 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012507
Victor Stinner7931d9a2011-11-04 00:22:48 +010012508 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509
12510 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012512 if (result == -2)
12513 return NULL;
12514
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515 if (result < 0) {
12516 PyErr_SetString(PyExc_ValueError, "substring not found");
12517 return NULL;
12518 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012519
Christian Heimes217cfd12007-12-02 14:31:20 +000012520 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521}
12522
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012523PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012524 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012526Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012527done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528
12529static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012530unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012532 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012533 Py_UCS4 fillchar = ' ';
12534
Victor Stinnere9a29352011-10-01 02:14:59 +020012535 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012536 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012537
Benjamin Petersonbac79492012-01-14 13:34:47 -050012538 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539 return NULL;
12540
Victor Stinnerc4b49542011-12-11 22:44:26 +010012541 if (PyUnicode_GET_LENGTH(self) >= width)
12542 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012543
Victor Stinnerc4b49542011-12-11 22:44:26 +010012544 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545}
12546
Alexander Belopolsky40018472011-02-26 01:02:56 +000012547PyObject *
12548PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012549{
12550 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012551
Guido van Rossumd57fd912000-03-10 22:53:23 +000012552 s = PyUnicode_FromObject(s);
12553 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012554 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012555 if (sep != NULL) {
12556 sep = PyUnicode_FromObject(sep);
12557 if (sep == NULL) {
12558 Py_DECREF(s);
12559 return NULL;
12560 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012561 }
12562
Victor Stinner9310abb2011-10-05 00:59:23 +020012563 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012564
12565 Py_DECREF(s);
12566 Py_XDECREF(sep);
12567 return result;
12568}
12569
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012570PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012571 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572\n\
12573Return a list of the words in S, using sep as the\n\
12574delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012575splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012576whitespace string is a separator and empty strings are\n\
12577removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012578
12579static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012580unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012582 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012584 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012585
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012586 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12587 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012588 return NULL;
12589
12590 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012591 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012592 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012593 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012594 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012595 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596}
12597
Thomas Wouters477c8d52006-05-27 19:21:47 +000012598PyObject *
12599PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12600{
12601 PyObject* str_obj;
12602 PyObject* sep_obj;
12603 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012604 int kind1, kind2, kind;
12605 void *buf1 = NULL, *buf2 = NULL;
12606 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012607
12608 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012609 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012610 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012611 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012612 if (!sep_obj) {
12613 Py_DECREF(str_obj);
12614 return NULL;
12615 }
12616 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12617 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012618 Py_DECREF(str_obj);
12619 return NULL;
12620 }
12621
Victor Stinner14f8f022011-10-05 20:58:25 +020012622 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012623 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012624 kind = Py_MAX(kind1, kind2);
12625 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012626 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012627 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012628 if (!buf1)
12629 goto onError;
12630 buf2 = PyUnicode_DATA(sep_obj);
12631 if (kind2 != kind)
12632 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12633 if (!buf2)
12634 goto onError;
12635 len1 = PyUnicode_GET_LENGTH(str_obj);
12636 len2 = PyUnicode_GET_LENGTH(sep_obj);
12637
Benjamin Petersonead6b532011-12-20 17:23:42 -060012638 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012639 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012640 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12641 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12642 else
12643 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012644 break;
12645 case PyUnicode_2BYTE_KIND:
12646 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12647 break;
12648 case PyUnicode_4BYTE_KIND:
12649 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12650 break;
12651 default:
12652 assert(0);
12653 out = 0;
12654 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012655
12656 Py_DECREF(sep_obj);
12657 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012658 if (kind1 != kind)
12659 PyMem_Free(buf1);
12660 if (kind2 != kind)
12661 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012662
12663 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664 onError:
12665 Py_DECREF(sep_obj);
12666 Py_DECREF(str_obj);
12667 if (kind1 != kind && buf1)
12668 PyMem_Free(buf1);
12669 if (kind2 != kind && buf2)
12670 PyMem_Free(buf2);
12671 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012672}
12673
12674
12675PyObject *
12676PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12677{
12678 PyObject* str_obj;
12679 PyObject* sep_obj;
12680 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681 int kind1, kind2, kind;
12682 void *buf1 = NULL, *buf2 = NULL;
12683 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012684
12685 str_obj = PyUnicode_FromObject(str_in);
12686 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012687 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012688 sep_obj = PyUnicode_FromObject(sep_in);
12689 if (!sep_obj) {
12690 Py_DECREF(str_obj);
12691 return NULL;
12692 }
12693
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012694 kind1 = PyUnicode_KIND(str_in);
12695 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012696 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012697 buf1 = PyUnicode_DATA(str_in);
12698 if (kind1 != kind)
12699 buf1 = _PyUnicode_AsKind(str_in, kind);
12700 if (!buf1)
12701 goto onError;
12702 buf2 = PyUnicode_DATA(sep_obj);
12703 if (kind2 != kind)
12704 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12705 if (!buf2)
12706 goto onError;
12707 len1 = PyUnicode_GET_LENGTH(str_obj);
12708 len2 = PyUnicode_GET_LENGTH(sep_obj);
12709
Benjamin Petersonead6b532011-12-20 17:23:42 -060012710 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012712 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12713 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12714 else
12715 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012716 break;
12717 case PyUnicode_2BYTE_KIND:
12718 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12719 break;
12720 case PyUnicode_4BYTE_KIND:
12721 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12722 break;
12723 default:
12724 assert(0);
12725 out = 0;
12726 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012727
12728 Py_DECREF(sep_obj);
12729 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012730 if (kind1 != kind)
12731 PyMem_Free(buf1);
12732 if (kind2 != kind)
12733 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012734
12735 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012736 onError:
12737 Py_DECREF(sep_obj);
12738 Py_DECREF(str_obj);
12739 if (kind1 != kind && buf1)
12740 PyMem_Free(buf1);
12741 if (kind2 != kind && buf2)
12742 PyMem_Free(buf2);
12743 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012744}
12745
12746PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012747 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012748\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012749Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012750the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012751found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012752
12753static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012754unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012755{
Victor Stinner9310abb2011-10-05 00:59:23 +020012756 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012757}
12758
12759PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012760 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012761\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012762Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012763the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012764separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012765
12766static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012767unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012768{
Victor Stinner9310abb2011-10-05 00:59:23 +020012769 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012770}
12771
Alexander Belopolsky40018472011-02-26 01:02:56 +000012772PyObject *
12773PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012774{
12775 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012776
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012777 s = PyUnicode_FromObject(s);
12778 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012779 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012780 if (sep != NULL) {
12781 sep = PyUnicode_FromObject(sep);
12782 if (sep == NULL) {
12783 Py_DECREF(s);
12784 return NULL;
12785 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012786 }
12787
Victor Stinner9310abb2011-10-05 00:59:23 +020012788 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012789
12790 Py_DECREF(s);
12791 Py_XDECREF(sep);
12792 return result;
12793}
12794
12795PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012796 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012797\n\
12798Return a list of the words in S, using sep as the\n\
12799delimiter string, starting at the end of the string and\n\
12800working to the front. If maxsplit is given, at most maxsplit\n\
12801splits are done. If sep is not specified, any whitespace string\n\
12802is a separator.");
12803
12804static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012805unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012806{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012807 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012808 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012809 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012810
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012811 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12812 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012813 return NULL;
12814
12815 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012816 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012817 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012818 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012819 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012820 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012821}
12822
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012823PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012824 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012825\n\
12826Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012827Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012828is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012829
12830static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012831unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012832{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012833 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012834 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012835
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012836 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12837 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012838 return NULL;
12839
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012840 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012841}
12842
12843static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012844PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012845{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012846 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012847}
12848
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012849PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012850 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012851\n\
12852Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012853and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012854
12855static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012856unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012857{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012858 if (PyUnicode_READY(self) == -1)
12859 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012860 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012861}
12862
Larry Hastings61272b72014-01-07 12:41:53 -080012863/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012864
Larry Hastings31826802013-10-19 00:09:25 -070012865@staticmethod
12866str.maketrans as unicode_maketrans
12867
12868 x: object
12869
12870 y: unicode=NULL
12871
12872 z: unicode=NULL
12873
12874 /
12875
12876Return a translation table usable for str.translate().
12877
12878If there is only one argument, it must be a dictionary mapping Unicode
12879ordinals (integers) or characters to Unicode ordinals, strings or None.
12880Character keys will be then converted to ordinals.
12881If there are two arguments, they must be strings of equal length, and
12882in the resulting dictionary, each character in x will be mapped to the
12883character at the same position in y. If there is a third argument, it
12884must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012885[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012886
12887PyDoc_STRVAR(unicode_maketrans__doc__,
Larry Hastings44e2eaa2013-11-23 15:37:55 -080012888"maketrans(x, y=None, z=None)\n"
Larry Hastings31826802013-10-19 00:09:25 -070012889"Return a translation table usable for str.translate().\n"
12890"\n"
Larry Hastings31826802013-10-19 00:09:25 -070012891"If there is only one argument, it must be a dictionary mapping Unicode\n"
12892"ordinals (integers) or characters to Unicode ordinals, strings or None.\n"
12893"Character keys will be then converted to ordinals.\n"
12894"If there are two arguments, they must be strings of equal length, and\n"
12895"in the resulting dictionary, each character in x will be mapped to the\n"
12896"character at the same position in y. If there is a third argument, it\n"
12897"must be a string, whose characters will be mapped to None in the result.");
12898
12899#define UNICODE_MAKETRANS_METHODDEF \
12900 {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__},
12901
12902static PyObject *
Larry Hastingsebdcb502013-11-23 14:54:00 -080012903unicode_maketrans_impl(void *null, PyObject *x, PyObject *y, PyObject *z);
Larry Hastings31826802013-10-19 00:09:25 -070012904
12905static PyObject *
Larry Hastingsebdcb502013-11-23 14:54:00 -080012906unicode_maketrans(void *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012907{
Larry Hastings31826802013-10-19 00:09:25 -070012908 PyObject *return_value = NULL;
12909 PyObject *x;
12910 PyObject *y = NULL;
12911 PyObject *z = NULL;
12912
12913 if (!PyArg_ParseTuple(args,
12914 "O|UU:maketrans",
12915 &x, &y, &z))
12916 goto exit;
Larry Hastingsebdcb502013-11-23 14:54:00 -080012917 return_value = unicode_maketrans_impl(null, x, y, z);
Larry Hastings31826802013-10-19 00:09:25 -070012918
12919exit:
12920 return return_value;
12921}
12922
12923static PyObject *
Larry Hastingsebdcb502013-11-23 14:54:00 -080012924unicode_maketrans_impl(void *null, PyObject *x, PyObject *y, PyObject *z)
Larry Hastings61272b72014-01-07 12:41:53 -080012925/*[clinic end generated code: checksum=7f76f414a0dfd0c614e0d4717872eeb520516da7]*/
Larry Hastings31826802013-10-19 00:09:25 -070012926{
Georg Brandlceee0772007-11-27 23:48:05 +000012927 PyObject *new = NULL, *key, *value;
12928 Py_ssize_t i = 0;
12929 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012930
Georg Brandlceee0772007-11-27 23:48:05 +000012931 new = PyDict_New();
12932 if (!new)
12933 return NULL;
12934 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012935 int x_kind, y_kind, z_kind;
12936 void *x_data, *y_data, *z_data;
12937
Georg Brandlceee0772007-11-27 23:48:05 +000012938 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012939 if (!PyUnicode_Check(x)) {
12940 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12941 "be a string if there is a second argument");
12942 goto err;
12943 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012944 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012945 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12946 "arguments must have equal length");
12947 goto err;
12948 }
12949 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012950 x_kind = PyUnicode_KIND(x);
12951 y_kind = PyUnicode_KIND(y);
12952 x_data = PyUnicode_DATA(x);
12953 y_data = PyUnicode_DATA(y);
12954 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12955 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012956 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012957 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012958 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012959 if (!value) {
12960 Py_DECREF(key);
12961 goto err;
12962 }
Georg Brandlceee0772007-11-27 23:48:05 +000012963 res = PyDict_SetItem(new, key, value);
12964 Py_DECREF(key);
12965 Py_DECREF(value);
12966 if (res < 0)
12967 goto err;
12968 }
12969 /* create entries for deleting chars in z */
12970 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012971 z_kind = PyUnicode_KIND(z);
12972 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012973 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012974 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012975 if (!key)
12976 goto err;
12977 res = PyDict_SetItem(new, key, Py_None);
12978 Py_DECREF(key);
12979 if (res < 0)
12980 goto err;
12981 }
12982 }
12983 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012984 int kind;
12985 void *data;
12986
Georg Brandlceee0772007-11-27 23:48:05 +000012987 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012988 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012989 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12990 "to maketrans it must be a dict");
12991 goto err;
12992 }
12993 /* copy entries into the new dict, converting string keys to int keys */
12994 while (PyDict_Next(x, &i, &key, &value)) {
12995 if (PyUnicode_Check(key)) {
12996 /* convert string keys to integer keys */
12997 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012998 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012999 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13000 "table must be of length 1");
13001 goto err;
13002 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013003 kind = PyUnicode_KIND(key);
13004 data = PyUnicode_DATA(key);
13005 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013006 if (!newkey)
13007 goto err;
13008 res = PyDict_SetItem(new, newkey, value);
13009 Py_DECREF(newkey);
13010 if (res < 0)
13011 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013012 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013013 /* just keep integer keys */
13014 if (PyDict_SetItem(new, key, value) < 0)
13015 goto err;
13016 } else {
13017 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13018 "be strings or integers");
13019 goto err;
13020 }
13021 }
13022 }
13023 return new;
13024 err:
13025 Py_DECREF(new);
13026 return NULL;
13027}
13028
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013029PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013030 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013031\n\
13032Return a copy of the string S, where all characters have been mapped\n\
13033through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000013034Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000013035Unmapped characters are left untouched. Characters mapped to None\n\
13036are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013037
13038static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013039unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013040{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013041 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013042}
13043
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013044PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013045 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013046\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013047Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013048
13049static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013050unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013051{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013052 if (PyUnicode_READY(self) == -1)
13053 return NULL;
13054 if (PyUnicode_IS_ASCII(self))
13055 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013056 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013057}
13058
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013059PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013060 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013061\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013062Pad a numeric string S with zeros on the left, to fill a field\n\
13063of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013064
13065static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013066unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013067{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013068 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013069 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013070 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013071 int kind;
13072 void *data;
13073 Py_UCS4 chr;
13074
Martin v. Löwis18e16552006-02-15 17:27:45 +000013075 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013076 return NULL;
13077
Benjamin Petersonbac79492012-01-14 13:34:47 -050013078 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013079 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013080
Victor Stinnerc4b49542011-12-11 22:44:26 +010013081 if (PyUnicode_GET_LENGTH(self) >= width)
13082 return unicode_result_unchanged(self);
13083
13084 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013085
13086 u = pad(self, fill, 0, '0');
13087
Walter Dörwald068325e2002-04-15 13:36:47 +000013088 if (u == NULL)
13089 return NULL;
13090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013091 kind = PyUnicode_KIND(u);
13092 data = PyUnicode_DATA(u);
13093 chr = PyUnicode_READ(kind, data, fill);
13094
13095 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013096 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013097 PyUnicode_WRITE(kind, data, 0, chr);
13098 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013099 }
13100
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013101 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013102 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013104
#if 0
/* Compiled-out debugging helper: forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13112
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013113PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013114 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013115\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013116Return True if S starts with the specified prefix, False otherwise.\n\
13117With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013118With optional end, stop comparing S at that position.\n\
13119prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120
13121static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013122unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013123 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013125 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013126 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013127 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013128 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013129 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130
Jesus Ceaac451502011-04-20 17:09:23 +020013131 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013132 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013133 if (PyTuple_Check(subobj)) {
13134 Py_ssize_t i;
13135 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013136 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013137 if (substring == NULL)
13138 return NULL;
13139 result = tailmatch(self, substring, start, end, -1);
13140 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013141 if (result == -1)
13142 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013143 if (result) {
13144 Py_RETURN_TRUE;
13145 }
13146 }
13147 /* nothing matched */
13148 Py_RETURN_FALSE;
13149 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013150 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013151 if (substring == NULL) {
13152 if (PyErr_ExceptionMatches(PyExc_TypeError))
13153 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13154 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013155 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013156 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013157 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013158 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013159 if (result == -1)
13160 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013161 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013162}
13163
13164
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013165PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013166 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013167\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013168Return True if S ends with the specified suffix, False otherwise.\n\
13169With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013170With optional end, stop comparing S at that position.\n\
13171suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013172
13173static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013174unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013175 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013176{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013177 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013178 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013179 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013180 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013181 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013182
Jesus Ceaac451502011-04-20 17:09:23 +020013183 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013184 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013185 if (PyTuple_Check(subobj)) {
13186 Py_ssize_t i;
13187 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013188 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013189 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013190 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013191 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013192 result = tailmatch(self, substring, start, end, +1);
13193 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013194 if (result == -1)
13195 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013196 if (result) {
13197 Py_RETURN_TRUE;
13198 }
13199 }
13200 Py_RETURN_FALSE;
13201 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013202 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013203 if (substring == NULL) {
13204 if (PyErr_ExceptionMatches(PyExc_TypeError))
13205 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13206 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013207 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013208 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013209 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013210 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013211 if (result == -1)
13212 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013213 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013214}
13215
Victor Stinner202fdca2012-05-07 12:47:02 +020013216Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013217_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013218{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013219 if (!writer->readonly)
13220 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13221 else {
13222 /* Copy-on-write mode: set buffer size to 0 so
13223 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13224 * next write. */
13225 writer->size = 0;
13226 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013227 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13228 writer->data = PyUnicode_DATA(writer->buffer);
13229 writer->kind = PyUnicode_KIND(writer->buffer);
13230}
13231
Victor Stinnerd3f08822012-05-29 12:57:52 +020013232void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013233_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013234{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013235 memset(writer, 0, sizeof(*writer));
13236#ifdef Py_DEBUG
13237 writer->kind = 5; /* invalid kind */
13238#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013239 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013240}
13241
Victor Stinnerd3f08822012-05-29 12:57:52 +020013242int
13243_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13244 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013245{
Victor Stinner6989ba02013-11-18 21:08:39 +010013246#ifdef MS_WINDOWS
13247 /* On Windows, overallocate by 50% is the best factor */
13248# define OVERALLOCATE_FACTOR 2
13249#else
13250 /* On Linux, overallocate by 25% is the best factor */
13251# define OVERALLOCATE_FACTOR 4
13252#endif
Victor Stinner202fdca2012-05-07 12:47:02 +020013253 Py_ssize_t newlen;
13254 PyObject *newbuffer;
13255
Victor Stinnerd3f08822012-05-29 12:57:52 +020013256 assert(length > 0);
13257
Victor Stinner202fdca2012-05-07 12:47:02 +020013258 if (length > PY_SSIZE_T_MAX - writer->pos) {
13259 PyErr_NoMemory();
13260 return -1;
13261 }
13262 newlen = writer->pos + length;
13263
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013264 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013265
Victor Stinnerd3f08822012-05-29 12:57:52 +020013266 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013267 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013268 if (writer->overallocate
13269 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13270 /* overallocate to limit the number of realloc() */
13271 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013272 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013273 if (newlen < writer->min_length)
13274 newlen = writer->min_length;
13275
Victor Stinnerd3f08822012-05-29 12:57:52 +020013276 writer->buffer = PyUnicode_New(newlen, maxchar);
13277 if (writer->buffer == NULL)
13278 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013279 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013280 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013281 if (writer->overallocate
13282 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13283 /* overallocate to limit the number of realloc() */
13284 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013285 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013286 if (newlen < writer->min_length)
13287 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013288
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013289 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013290 /* resize + widen */
13291 newbuffer = PyUnicode_New(newlen, maxchar);
13292 if (newbuffer == NULL)
13293 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013294 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13295 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013296 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013297 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013298 }
13299 else {
13300 newbuffer = resize_compact(writer->buffer, newlen);
13301 if (newbuffer == NULL)
13302 return -1;
13303 }
13304 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013305 }
13306 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013307 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013308 newbuffer = PyUnicode_New(writer->size, maxchar);
13309 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013310 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013311 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13312 writer->buffer, 0, writer->pos);
13313 Py_DECREF(writer->buffer);
13314 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013315 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013316 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013317 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013318
13319#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013320}
13321
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013322Py_LOCAL_INLINE(int)
13323_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013324{
13325 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13326 return -1;
13327 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13328 writer->pos++;
13329 return 0;
13330}
13331
13332int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013333_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13334{
13335 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13336}
13337
13338int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013339_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13340{
13341 Py_UCS4 maxchar;
13342 Py_ssize_t len;
13343
13344 if (PyUnicode_READY(str) == -1)
13345 return -1;
13346 len = PyUnicode_GET_LENGTH(str);
13347 if (len == 0)
13348 return 0;
13349 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13350 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013351 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013352 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013353 Py_INCREF(str);
13354 writer->buffer = str;
13355 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013356 writer->pos += len;
13357 return 0;
13358 }
13359 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13360 return -1;
13361 }
13362 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13363 str, 0, len);
13364 writer->pos += len;
13365 return 0;
13366}
13367
Victor Stinnere215d962012-10-06 23:03:36 +020013368int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013369_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13370 Py_ssize_t start, Py_ssize_t end)
13371{
13372 Py_UCS4 maxchar;
13373 Py_ssize_t len;
13374
13375 if (PyUnicode_READY(str) == -1)
13376 return -1;
13377
13378 assert(0 <= start);
13379 assert(end <= PyUnicode_GET_LENGTH(str));
13380 assert(start <= end);
13381
13382 if (end == 0)
13383 return 0;
13384
13385 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13386 return _PyUnicodeWriter_WriteStr(writer, str);
13387
13388 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13389 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13390 else
13391 maxchar = writer->maxchar;
13392 len = end - start;
13393
13394 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13395 return -1;
13396
13397 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13398 str, start, len);
13399 writer->pos += len;
13400 return 0;
13401}
13402
13403int
Victor Stinner4a587072013-11-19 12:54:53 +010013404_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13405 const char *ascii, Py_ssize_t len)
13406{
13407 if (len == -1)
13408 len = strlen(ascii);
13409
13410 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13411
13412 if (writer->buffer == NULL && !writer->overallocate) {
13413 PyObject *str;
13414
13415 str = _PyUnicode_FromASCII(ascii, len);
13416 if (str == NULL)
13417 return -1;
13418
13419 writer->readonly = 1;
13420 writer->buffer = str;
13421 _PyUnicodeWriter_Update(writer);
13422 writer->pos += len;
13423 return 0;
13424 }
13425
13426 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13427 return -1;
13428
13429 switch (writer->kind)
13430 {
13431 case PyUnicode_1BYTE_KIND:
13432 {
13433 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13434 Py_UCS1 *data = writer->data;
13435
13436 Py_MEMCPY(data + writer->pos, str, len);
13437 break;
13438 }
13439 case PyUnicode_2BYTE_KIND:
13440 {
13441 _PyUnicode_CONVERT_BYTES(
13442 Py_UCS1, Py_UCS2,
13443 ascii, ascii + len,
13444 (Py_UCS2 *)writer->data + writer->pos);
13445 break;
13446 }
13447 case PyUnicode_4BYTE_KIND:
13448 {
13449 _PyUnicode_CONVERT_BYTES(
13450 Py_UCS1, Py_UCS4,
13451 ascii, ascii + len,
13452 (Py_UCS4 *)writer->data + writer->pos);
13453 break;
13454 }
13455 default:
13456 assert(0);
13457 }
13458
13459 writer->pos += len;
13460 return 0;
13461}
13462
13463int
13464_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13465 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013466{
13467 Py_UCS4 maxchar;
13468
13469 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13470 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13471 return -1;
13472 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13473 writer->pos += len;
13474 return 0;
13475}
13476
Victor Stinnerd3f08822012-05-29 12:57:52 +020013477PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013478_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013479{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013480 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013481 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013482 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013483 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013484 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013485 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013486 str = writer->buffer;
13487 writer->buffer = NULL;
13488 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13489 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013490 }
13491 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13492 PyObject *newbuffer;
13493 newbuffer = resize_compact(writer->buffer, writer->pos);
13494 if (newbuffer == NULL) {
13495 Py_DECREF(writer->buffer);
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013496 writer->buffer = NULL;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013497 return NULL;
13498 }
13499 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013500 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013501 str = writer->buffer;
13502 writer->buffer = NULL;
13503 assert(_PyUnicode_CheckConsistency(str, 1));
13504 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013505}
13506
Victor Stinnerd3f08822012-05-29 12:57:52 +020013507void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013508_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013509{
13510 Py_CLEAR(writer->buffer);
13511}
13512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013513#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013514
13515PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013516 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013517\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013518Return a formatted version of S, using substitutions from args and kwargs.\n\
13519The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013520
Eric Smith27bbca62010-11-04 17:06:58 +000013521PyDoc_STRVAR(format_map__doc__,
13522 "S.format_map(mapping) -> str\n\
13523\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013524Return a formatted version of S, using substitutions from mapping.\n\
13525The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013526
Eric Smith4a7d76d2008-05-30 18:10:19 +000013527static PyObject *
13528unicode__format__(PyObject* self, PyObject* args)
13529{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013530 PyObject *format_spec;
13531 _PyUnicodeWriter writer;
13532 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013533
13534 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13535 return NULL;
13536
Victor Stinnerd3f08822012-05-29 12:57:52 +020013537 if (PyUnicode_READY(self) == -1)
13538 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013539 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013540 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13541 self, format_spec, 0,
13542 PyUnicode_GET_LENGTH(format_spec));
13543 if (ret == -1) {
13544 _PyUnicodeWriter_Dealloc(&writer);
13545 return NULL;
13546 }
13547 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013548}
13549
Eric Smith8c663262007-08-25 02:26:07 +000013550PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013551 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013552\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013553Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013554
13555static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013556unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013557{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013558 Py_ssize_t size;
13559
13560 /* If it's a compact object, account for base structure +
13561 character data. */
13562 if (PyUnicode_IS_COMPACT_ASCII(v))
13563 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13564 else if (PyUnicode_IS_COMPACT(v))
13565 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013566 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013567 else {
13568 /* If it is a two-block object, account for base object, and
13569 for character block if present. */
13570 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013571 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013572 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013573 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013574 }
13575 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013576 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013577 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013578 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013579 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013580 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013581
13582 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013583}
13584
13585PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013586 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013587
13588static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013589unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013590{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013591 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013592 if (!copy)
13593 return NULL;
13594 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013595}
13596
Guido van Rossumd57fd912000-03-10 22:53:23 +000013597static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013598 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013599 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013600 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13601 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013602 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13603 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013604 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013605 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13606 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13607 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013608 {"expandtabs", (PyCFunction) unicode_expandtabs,
13609 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013610 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013611 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013612 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13613 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13614 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013615 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013616 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13617 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13618 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013619 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013620 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013621 {"splitlines", (PyCFunction) unicode_splitlines,
13622 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013623 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013624 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13625 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13626 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13627 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13628 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13629 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13630 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13631 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13632 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13633 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13634 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13635 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13636 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13637 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013638 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013639 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013640 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013641 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013642 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013643 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013644 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013645 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013646#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013647 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013648 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013649#endif
13650
Benjamin Peterson14339b62009-01-31 16:36:08 +000013651 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013652 {NULL, NULL}
13653};
13654
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013655static PyObject *
13656unicode_mod(PyObject *v, PyObject *w)
13657{
Brian Curtindfc80e32011-08-10 20:28:54 -050013658 if (!PyUnicode_Check(v))
13659 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013660 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013661}
13662
13663static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013664 0, /*nb_add*/
13665 0, /*nb_subtract*/
13666 0, /*nb_multiply*/
13667 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013668};
13669
Guido van Rossumd57fd912000-03-10 22:53:23 +000013670static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013671 (lenfunc) unicode_length, /* sq_length */
13672 PyUnicode_Concat, /* sq_concat */
13673 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13674 (ssizeargfunc) unicode_getitem, /* sq_item */
13675 0, /* sq_slice */
13676 0, /* sq_ass_item */
13677 0, /* sq_ass_slice */
13678 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013679};
13680
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013681static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013682unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013683{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013684 if (PyUnicode_READY(self) == -1)
13685 return NULL;
13686
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013687 if (PyIndex_Check(item)) {
13688 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013689 if (i == -1 && PyErr_Occurred())
13690 return NULL;
13691 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013692 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013693 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013694 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013695 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013696 PyObject *result;
13697 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013698 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013699 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013701 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013702 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013703 return NULL;
13704 }
13705
13706 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013707 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013708 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013709 slicelength == PyUnicode_GET_LENGTH(self)) {
13710 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013711 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013712 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013713 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013714 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013715 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013716 src_kind = PyUnicode_KIND(self);
13717 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013718 if (!PyUnicode_IS_ASCII(self)) {
13719 kind_limit = kind_maxchar_limit(src_kind);
13720 max_char = 0;
13721 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13722 ch = PyUnicode_READ(src_kind, src_data, cur);
13723 if (ch > max_char) {
13724 max_char = ch;
13725 if (max_char >= kind_limit)
13726 break;
13727 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013728 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013729 }
Victor Stinner55c99112011-10-13 01:17:06 +020013730 else
13731 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013732 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013733 if (result == NULL)
13734 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013735 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013736 dest_data = PyUnicode_DATA(result);
13737
13738 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013739 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13740 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013741 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013742 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013743 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013744 } else {
13745 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13746 return NULL;
13747 }
13748}
13749
13750static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013751 (lenfunc)unicode_length, /* mp_length */
13752 (binaryfunc)unicode_subscript, /* mp_subscript */
13753 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013754};
13755
Guido van Rossumd57fd912000-03-10 22:53:23 +000013756
/* Helpers for PyUnicode_Format() */
13758
Victor Stinnera47082312012-10-04 02:19:54 +020013759struct unicode_formatter_t {
13760 PyObject *args;
13761 int args_owned;
13762 Py_ssize_t arglen, argidx;
13763 PyObject *dict;
13764
13765 enum PyUnicode_Kind fmtkind;
13766 Py_ssize_t fmtcnt, fmtpos;
13767 void *fmtdata;
13768 PyObject *fmtstr;
13769
13770 _PyUnicodeWriter writer;
13771};
13772
13773struct unicode_format_arg_t {
13774 Py_UCS4 ch;
13775 int flags;
13776 Py_ssize_t width;
13777 int prec;
13778 int sign;
13779};
13780
Guido van Rossumd57fd912000-03-10 22:53:23 +000013781static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013782unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013783{
Victor Stinnera47082312012-10-04 02:19:54 +020013784 Py_ssize_t argidx = ctx->argidx;
13785
13786 if (argidx < ctx->arglen) {
13787 ctx->argidx++;
13788 if (ctx->arglen < 0)
13789 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013790 else
Victor Stinnera47082312012-10-04 02:19:54 +020013791 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013792 }
13793 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013794 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013795 return NULL;
13796}
13797
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013798/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013799
Victor Stinnera47082312012-10-04 02:19:54 +020013800/* Format a float into the writer if the writer is not NULL, or into *p_output
13801 otherwise.
13802
13803 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013804static int
Victor Stinnera47082312012-10-04 02:19:54 +020013805formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13806 PyObject **p_output,
13807 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013808{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013809 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013810 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013811 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013812 int prec;
13813 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013814
Guido van Rossumd57fd912000-03-10 22:53:23 +000013815 x = PyFloat_AsDouble(v);
13816 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013817 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013818
Victor Stinnera47082312012-10-04 02:19:54 +020013819 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013820 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013821 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013822
Victor Stinnera47082312012-10-04 02:19:54 +020013823 if (arg->flags & F_ALT)
13824 dtoa_flags = Py_DTSF_ALT;
13825 else
13826 dtoa_flags = 0;
13827 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013828 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013829 return -1;
13830 len = strlen(p);
13831 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013832 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013833 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013834 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013835 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013836 }
13837 else
13838 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013839 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013840 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013841}
13842
Victor Stinnerd0880d52012-04-27 23:40:13 +020013843/* formatlong() emulates the format codes d, u, o, x and X, and
13844 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13845 * Python's regular ints.
13846 * Return value: a new PyUnicodeObject*, or NULL if error.
13847 * The output string is of the form
13848 * "-"? ("0x" | "0X")? digit+
13849 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13850 * set in flags. The case of hex digits will be correct,
13851 * There will be at least prec digits, zero-filled on the left if
13852 * necessary to get that many.
13853 * val object to be converted
13854 * flags bitmask of format flags; only F_ALT is looked at
13855 * prec minimum number of digits; 0-fill on left if needed
13856 * type a character in [duoxX]; u acts the same as d
13857 *
13858 * CAUTION: o, x and X conversions on regular ints can never
13859 * produce a '-' sign, but can for Python's unbounded ints.
13860 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013861static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013862formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013863{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013864 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013865 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013866 Py_ssize_t i;
13867 int sign; /* 1 if '-', else 0 */
13868 int len; /* number of characters */
13869 Py_ssize_t llen;
13870 int numdigits; /* len == numnondigits + numdigits */
13871 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013872 int prec = arg->prec;
13873 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013874
Victor Stinnerd0880d52012-04-27 23:40:13 +020013875 /* Avoid exceeding SSIZE_T_MAX */
13876 if (prec > INT_MAX-3) {
13877 PyErr_SetString(PyExc_OverflowError,
13878 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013879 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013880 }
13881
13882 assert(PyLong_Check(val));
13883
13884 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013885 default:
13886 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013887 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013888 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013889 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013890 /* int and int subclasses should print numerically when a numeric */
13891 /* format code is used (see issue18780) */
13892 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013893 break;
13894 case 'o':
13895 numnondigits = 2;
13896 result = PyNumber_ToBase(val, 8);
13897 break;
13898 case 'x':
13899 case 'X':
13900 numnondigits = 2;
13901 result = PyNumber_ToBase(val, 16);
13902 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013903 }
13904 if (!result)
13905 return NULL;
13906
13907 assert(unicode_modifiable(result));
13908 assert(PyUnicode_IS_READY(result));
13909 assert(PyUnicode_IS_ASCII(result));
13910
13911 /* To modify the string in-place, there can only be one reference. */
13912 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013913 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013914 PyErr_BadInternalCall();
13915 return NULL;
13916 }
13917 buf = PyUnicode_DATA(result);
13918 llen = PyUnicode_GET_LENGTH(result);
13919 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013920 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013921 PyErr_SetString(PyExc_ValueError,
13922 "string too large in _PyBytes_FormatLong");
13923 return NULL;
13924 }
13925 len = (int)llen;
13926 sign = buf[0] == '-';
13927 numnondigits += sign;
13928 numdigits = len - numnondigits;
13929 assert(numdigits > 0);
13930
13931 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013932 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013933 (type == 'o' || type == 'x' || type == 'X'))) {
13934 assert(buf[sign] == '0');
13935 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13936 buf[sign+1] == 'o');
13937 numnondigits -= 2;
13938 buf += 2;
13939 len -= 2;
13940 if (sign)
13941 buf[0] = '-';
13942 assert(len == numnondigits + numdigits);
13943 assert(numdigits > 0);
13944 }
13945
13946 /* Fill with leading zeroes to meet minimum width. */
13947 if (prec > numdigits) {
13948 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13949 numnondigits + prec);
13950 char *b1;
13951 if (!r1) {
13952 Py_DECREF(result);
13953 return NULL;
13954 }
13955 b1 = PyBytes_AS_STRING(r1);
13956 for (i = 0; i < numnondigits; ++i)
13957 *b1++ = *buf++;
13958 for (i = 0; i < prec - numdigits; i++)
13959 *b1++ = '0';
13960 for (i = 0; i < numdigits; i++)
13961 *b1++ = *buf++;
13962 *b1 = '\0';
13963 Py_DECREF(result);
13964 result = r1;
13965 buf = PyBytes_AS_STRING(result);
13966 len = numnondigits + prec;
13967 }
13968
13969 /* Fix up case for hex conversions. */
13970 if (type == 'X') {
13971 /* Need to convert all lower case letters to upper case.
13972 and need to convert 0x to 0X (and -0x to -0X). */
13973 for (i = 0; i < len; i++)
13974 if (buf[i] >= 'a' && buf[i] <= 'x')
13975 buf[i] -= 'a'-'A';
13976 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013977 if (!PyUnicode_Check(result)
13978 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013979 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013980 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013981 Py_DECREF(result);
13982 result = unicode;
13983 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013984 else if (len != PyUnicode_GET_LENGTH(result)) {
13985 if (PyUnicode_Resize(&result, len) < 0)
13986 Py_CLEAR(result);
13987 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013988 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013989}
13990
Ethan Furmandf3ed242014-01-05 06:50:30 -080013991/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020013992 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013993 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013994 * -1 and raise an exception on error */
13995static int
Victor Stinnera47082312012-10-04 02:19:54 +020013996mainformatlong(PyObject *v,
13997 struct unicode_format_arg_t *arg,
13998 PyObject **p_output,
13999 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014000{
14001 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014002 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014003
14004 if (!PyNumber_Check(v))
14005 goto wrongtype;
14006
Ethan Furmanf9bba9c2014-01-11 23:20:58 -080014007 /* make sure number is a type of integer */
Ethan Furmana70805e2014-01-12 08:42:35 -080014008 /* if not, issue deprecation warning for now */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014009 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014010 if (type == 'o' || type == 'x' || type == 'X') {
14011 iobj = PyNumber_Index(v);
14012 if (iobj == NULL) {
Ethan Furmanf9bba9c2014-01-11 23:20:58 -080014013 PyErr_Clear();
14014 if (PyErr_WarnEx(PyExc_DeprecationWarning,
14015 "automatic int conversions have been deprecated",
14016 1)) {
14017 return -1;
14018 }
14019 iobj = PyNumber_Long(v);
14020 if (iobj == NULL ) {
14021 if (PyErr_ExceptionMatches(PyExc_TypeError))
14022 goto wrongtype;
14023 return -1;
14024 }
Ethan Furmandf3ed242014-01-05 06:50:30 -080014025 }
14026 }
14027 else {
14028 iobj = PyNumber_Long(v);
14029 if (iobj == NULL ) {
14030 if (PyErr_ExceptionMatches(PyExc_TypeError))
14031 goto wrongtype;
14032 return -1;
14033 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014034 }
14035 assert(PyLong_Check(iobj));
14036 }
14037 else {
14038 iobj = v;
14039 Py_INCREF(iobj);
14040 }
14041
14042 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014043 && arg->width == -1 && arg->prec == -1
14044 && !(arg->flags & (F_SIGN | F_BLANK))
14045 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014046 {
14047 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014048 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014049 int base;
14050
Victor Stinnera47082312012-10-04 02:19:54 +020014051 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014052 {
14053 default:
14054 assert(0 && "'type' not in [diuoxX]");
14055 case 'd':
14056 case 'i':
14057 case 'u':
14058 base = 10;
14059 break;
14060 case 'o':
14061 base = 8;
14062 break;
14063 case 'x':
14064 case 'X':
14065 base = 16;
14066 break;
14067 }
14068
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014069 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14070 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014071 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014072 }
14073 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014074 return 1;
14075 }
14076
Victor Stinnera47082312012-10-04 02:19:54 +020014077 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014078 Py_DECREF(iobj);
14079 if (res == NULL)
14080 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014081 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014082 return 0;
14083
14084wrongtype:
14085 PyErr_Format(PyExc_TypeError,
14086 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020014087 "not %.200s",
14088 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014089 return -1;
14090}
14091
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014092static Py_UCS4
14093formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014094{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014095 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014096 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014097 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014098 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014099 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014100 goto onError;
14101 }
14102 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014103 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014104 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014105 /* make sure number is a type of integer */
Ethan Furmana70805e2014-01-12 08:42:35 -080014106 /* if not, issue deprecation warning for now */
Ethan Furmandf3ed242014-01-05 06:50:30 -080014107 if (!PyLong_Check(v)) {
14108 iobj = PyNumber_Index(v);
14109 if (iobj == NULL) {
Ethan Furmanf9bba9c2014-01-11 23:20:58 -080014110 PyErr_Clear();
14111 if (PyErr_WarnEx(PyExc_DeprecationWarning,
14112 "automatic int conversions have been deprecated",
14113 1)) {
14114 return -1;
14115 }
14116 iobj = PyNumber_Long(v);
14117 if (iobj == NULL ) {
14118 if (PyErr_ExceptionMatches(PyExc_TypeError))
14119 goto onError;
14120 return -1;
14121 }
Ethan Furmandf3ed242014-01-05 06:50:30 -080014122 }
14123 v = iobj;
14124 Py_DECREF(iobj);
14125 }
14126 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014127 x = PyLong_AsLong(v);
14128 if (x == -1 && PyErr_Occurred())
14129 goto onError;
14130
Victor Stinner8faf8212011-12-08 22:14:11 +010014131 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014132 PyErr_SetString(PyExc_OverflowError,
14133 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014134 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014135 }
14136
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014137 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014138 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014139
Benjamin Peterson29060642009-01-31 22:14:21 +000014140 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014141 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014142 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014143 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014144}
14145
Victor Stinnera47082312012-10-04 02:19:54 +020014146/* Parse options of an argument: flags, width, precision.
14147 Handle also "%(name)" syntax.
14148
14149 Return 0 if the argument has been formatted into arg->str.
14150 Return 1 if the argument has been written into ctx->writer,
14151 Raise an exception and return -1 on error. */
14152static int
14153unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14154 struct unicode_format_arg_t *arg)
14155{
14156#define FORMAT_READ(ctx) \
14157 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14158
14159 PyObject *v;
14160
Victor Stinnera47082312012-10-04 02:19:54 +020014161 if (arg->ch == '(') {
14162 /* Get argument value from a dictionary. Example: "%(name)s". */
14163 Py_ssize_t keystart;
14164 Py_ssize_t keylen;
14165 PyObject *key;
14166 int pcount = 1;
14167
14168 if (ctx->dict == NULL) {
14169 PyErr_SetString(PyExc_TypeError,
14170 "format requires a mapping");
14171 return -1;
14172 }
14173 ++ctx->fmtpos;
14174 --ctx->fmtcnt;
14175 keystart = ctx->fmtpos;
14176 /* Skip over balanced parentheses */
14177 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14178 arg->ch = FORMAT_READ(ctx);
14179 if (arg->ch == ')')
14180 --pcount;
14181 else if (arg->ch == '(')
14182 ++pcount;
14183 ctx->fmtpos++;
14184 }
14185 keylen = ctx->fmtpos - keystart - 1;
14186 if (ctx->fmtcnt < 0 || pcount > 0) {
14187 PyErr_SetString(PyExc_ValueError,
14188 "incomplete format key");
14189 return -1;
14190 }
14191 key = PyUnicode_Substring(ctx->fmtstr,
14192 keystart, keystart + keylen);
14193 if (key == NULL)
14194 return -1;
14195 if (ctx->args_owned) {
14196 Py_DECREF(ctx->args);
14197 ctx->args_owned = 0;
14198 }
14199 ctx->args = PyObject_GetItem(ctx->dict, key);
14200 Py_DECREF(key);
14201 if (ctx->args == NULL)
14202 return -1;
14203 ctx->args_owned = 1;
14204 ctx->arglen = -1;
14205 ctx->argidx = -2;
14206 }
14207
14208 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014209 while (--ctx->fmtcnt >= 0) {
14210 arg->ch = FORMAT_READ(ctx);
14211 ctx->fmtpos++;
14212 switch (arg->ch) {
14213 case '-': arg->flags |= F_LJUST; continue;
14214 case '+': arg->flags |= F_SIGN; continue;
14215 case ' ': arg->flags |= F_BLANK; continue;
14216 case '#': arg->flags |= F_ALT; continue;
14217 case '0': arg->flags |= F_ZERO; continue;
14218 }
14219 break;
14220 }
14221
14222 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014223 if (arg->ch == '*') {
14224 v = unicode_format_getnextarg(ctx);
14225 if (v == NULL)
14226 return -1;
14227 if (!PyLong_Check(v)) {
14228 PyErr_SetString(PyExc_TypeError,
14229 "* wants int");
14230 return -1;
14231 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014232 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014233 if (arg->width == -1 && PyErr_Occurred())
14234 return -1;
14235 if (arg->width < 0) {
14236 arg->flags |= F_LJUST;
14237 arg->width = -arg->width;
14238 }
14239 if (--ctx->fmtcnt >= 0) {
14240 arg->ch = FORMAT_READ(ctx);
14241 ctx->fmtpos++;
14242 }
14243 }
14244 else if (arg->ch >= '0' && arg->ch <= '9') {
14245 arg->width = arg->ch - '0';
14246 while (--ctx->fmtcnt >= 0) {
14247 arg->ch = FORMAT_READ(ctx);
14248 ctx->fmtpos++;
14249 if (arg->ch < '0' || arg->ch > '9')
14250 break;
14251 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14252 mixing signed and unsigned comparison. Since arg->ch is between
14253 '0' and '9', casting to int is safe. */
14254 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14255 PyErr_SetString(PyExc_ValueError,
14256 "width too big");
14257 return -1;
14258 }
14259 arg->width = arg->width*10 + (arg->ch - '0');
14260 }
14261 }
14262
14263 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014264 if (arg->ch == '.') {
14265 arg->prec = 0;
14266 if (--ctx->fmtcnt >= 0) {
14267 arg->ch = FORMAT_READ(ctx);
14268 ctx->fmtpos++;
14269 }
14270 if (arg->ch == '*') {
14271 v = unicode_format_getnextarg(ctx);
14272 if (v == NULL)
14273 return -1;
14274 if (!PyLong_Check(v)) {
14275 PyErr_SetString(PyExc_TypeError,
14276 "* wants int");
14277 return -1;
14278 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014279 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014280 if (arg->prec == -1 && PyErr_Occurred())
14281 return -1;
14282 if (arg->prec < 0)
14283 arg->prec = 0;
14284 if (--ctx->fmtcnt >= 0) {
14285 arg->ch = FORMAT_READ(ctx);
14286 ctx->fmtpos++;
14287 }
14288 }
14289 else if (arg->ch >= '0' && arg->ch <= '9') {
14290 arg->prec = arg->ch - '0';
14291 while (--ctx->fmtcnt >= 0) {
14292 arg->ch = FORMAT_READ(ctx);
14293 ctx->fmtpos++;
14294 if (arg->ch < '0' || arg->ch > '9')
14295 break;
14296 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14297 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014298 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014299 return -1;
14300 }
14301 arg->prec = arg->prec*10 + (arg->ch - '0');
14302 }
14303 }
14304 }
14305
14306 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14307 if (ctx->fmtcnt >= 0) {
14308 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14309 if (--ctx->fmtcnt >= 0) {
14310 arg->ch = FORMAT_READ(ctx);
14311 ctx->fmtpos++;
14312 }
14313 }
14314 }
14315 if (ctx->fmtcnt < 0) {
14316 PyErr_SetString(PyExc_ValueError,
14317 "incomplete format");
14318 return -1;
14319 }
14320 return 0;
14321
14322#undef FORMAT_READ
14323}
14324
14325/* Format one argument. Supported conversion specifiers:
14326
14327 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014328 - "i", "d", "u": int or float
14329 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014330 - "e", "E", "f", "F", "g", "G": float
14331 - "c": int or str (1 character)
14332
Victor Stinner8dbd4212012-12-04 09:30:24 +010014333 When possible, the output is written directly into the Unicode writer
14334 (ctx->writer). A string is created when padding is required.
14335
Victor Stinnera47082312012-10-04 02:19:54 +020014336 Return 0 if the argument has been formatted into *p_str,
14337 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014338 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014339static int
14340unicode_format_arg_format(struct unicode_formatter_t *ctx,
14341 struct unicode_format_arg_t *arg,
14342 PyObject **p_str)
14343{
14344 PyObject *v;
14345 _PyUnicodeWriter *writer = &ctx->writer;
14346
14347 if (ctx->fmtcnt == 0)
14348 ctx->writer.overallocate = 0;
14349
14350 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014351 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014352 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014353 return 1;
14354 }
14355
14356 v = unicode_format_getnextarg(ctx);
14357 if (v == NULL)
14358 return -1;
14359
Victor Stinnera47082312012-10-04 02:19:54 +020014360
14361 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014362 case 's':
14363 case 'r':
14364 case 'a':
14365 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14366 /* Fast path */
14367 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14368 return -1;
14369 return 1;
14370 }
14371
14372 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14373 *p_str = v;
14374 Py_INCREF(*p_str);
14375 }
14376 else {
14377 if (arg->ch == 's')
14378 *p_str = PyObject_Str(v);
14379 else if (arg->ch == 'r')
14380 *p_str = PyObject_Repr(v);
14381 else
14382 *p_str = PyObject_ASCII(v);
14383 }
14384 break;
14385
14386 case 'i':
14387 case 'd':
14388 case 'u':
14389 case 'o':
14390 case 'x':
14391 case 'X':
14392 {
14393 int ret = mainformatlong(v, arg, p_str, writer);
14394 if (ret != 0)
14395 return ret;
14396 arg->sign = 1;
14397 break;
14398 }
14399
14400 case 'e':
14401 case 'E':
14402 case 'f':
14403 case 'F':
14404 case 'g':
14405 case 'G':
14406 if (arg->width == -1 && arg->prec == -1
14407 && !(arg->flags & (F_SIGN | F_BLANK)))
14408 {
14409 /* Fast path */
14410 if (formatfloat(v, arg, NULL, writer) == -1)
14411 return -1;
14412 return 1;
14413 }
14414
14415 arg->sign = 1;
14416 if (formatfloat(v, arg, p_str, NULL) == -1)
14417 return -1;
14418 break;
14419
14420 case 'c':
14421 {
14422 Py_UCS4 ch = formatchar(v);
14423 if (ch == (Py_UCS4) -1)
14424 return -1;
14425 if (arg->width == -1 && arg->prec == -1) {
14426 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014427 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014428 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014429 return 1;
14430 }
14431 *p_str = PyUnicode_FromOrdinal(ch);
14432 break;
14433 }
14434
14435 default:
14436 PyErr_Format(PyExc_ValueError,
14437 "unsupported format character '%c' (0x%x) "
14438 "at index %zd",
14439 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14440 (int)arg->ch,
14441 ctx->fmtpos - 1);
14442 return -1;
14443 }
14444 if (*p_str == NULL)
14445 return -1;
14446 assert (PyUnicode_Check(*p_str));
14447 return 0;
14448}
14449
14450static int
14451unicode_format_arg_output(struct unicode_formatter_t *ctx,
14452 struct unicode_format_arg_t *arg,
14453 PyObject *str)
14454{
14455 Py_ssize_t len;
14456 enum PyUnicode_Kind kind;
14457 void *pbuf;
14458 Py_ssize_t pindex;
14459 Py_UCS4 signchar;
14460 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014461 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014462 Py_ssize_t sublen;
14463 _PyUnicodeWriter *writer = &ctx->writer;
14464 Py_UCS4 fill;
14465
14466 fill = ' ';
14467 if (arg->sign && arg->flags & F_ZERO)
14468 fill = '0';
14469
14470 if (PyUnicode_READY(str) == -1)
14471 return -1;
14472
14473 len = PyUnicode_GET_LENGTH(str);
14474 if ((arg->width == -1 || arg->width <= len)
14475 && (arg->prec == -1 || arg->prec >= len)
14476 && !(arg->flags & (F_SIGN | F_BLANK)))
14477 {
14478 /* Fast path */
14479 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14480 return -1;
14481 return 0;
14482 }
14483
14484 /* Truncate the string for "s", "r" and "a" formats
14485 if the precision is set */
14486 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14487 if (arg->prec >= 0 && len > arg->prec)
14488 len = arg->prec;
14489 }
14490
14491 /* Adjust sign and width */
14492 kind = PyUnicode_KIND(str);
14493 pbuf = PyUnicode_DATA(str);
14494 pindex = 0;
14495 signchar = '\0';
14496 if (arg->sign) {
14497 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14498 if (ch == '-' || ch == '+') {
14499 signchar = ch;
14500 len--;
14501 pindex++;
14502 }
14503 else if (arg->flags & F_SIGN)
14504 signchar = '+';
14505 else if (arg->flags & F_BLANK)
14506 signchar = ' ';
14507 else
14508 arg->sign = 0;
14509 }
14510 if (arg->width < len)
14511 arg->width = len;
14512
14513 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014514 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014515 if (!(arg->flags & F_LJUST)) {
14516 if (arg->sign) {
14517 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014518 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014519 }
14520 else {
14521 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014522 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014523 }
14524 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014525 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14526 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014527 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014528 }
14529
Victor Stinnera47082312012-10-04 02:19:54 +020014530 buflen = arg->width;
14531 if (arg->sign && len == arg->width)
14532 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014533 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014534 return -1;
14535
14536 /* Write the sign if needed */
14537 if (arg->sign) {
14538 if (fill != ' ') {
14539 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14540 writer->pos += 1;
14541 }
14542 if (arg->width > len)
14543 arg->width--;
14544 }
14545
14546 /* Write the numeric prefix for "x", "X" and "o" formats
14547 if the alternate form is used.
14548 For example, write "0x" for the "%#x" format. */
14549 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14550 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14551 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14552 if (fill != ' ') {
14553 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14554 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14555 writer->pos += 2;
14556 pindex += 2;
14557 }
14558 arg->width -= 2;
14559 if (arg->width < 0)
14560 arg->width = 0;
14561 len -= 2;
14562 }
14563
14564 /* Pad left with the fill character if needed */
14565 if (arg->width > len && !(arg->flags & F_LJUST)) {
14566 sublen = arg->width - len;
14567 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14568 writer->pos += sublen;
14569 arg->width = len;
14570 }
14571
14572 /* If padding with spaces: write sign if needed and/or numeric prefix if
14573 the alternate form is used */
14574 if (fill == ' ') {
14575 if (arg->sign) {
14576 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14577 writer->pos += 1;
14578 }
14579 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14580 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14581 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14582 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14583 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14584 writer->pos += 2;
14585 pindex += 2;
14586 }
14587 }
14588
14589 /* Write characters */
14590 if (len) {
14591 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14592 str, pindex, len);
14593 writer->pos += len;
14594 }
14595
14596 /* Pad right with the fill character if needed */
14597 if (arg->width > len) {
14598 sublen = arg->width - len;
14599 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14600 writer->pos += sublen;
14601 }
14602 return 0;
14603}
14604
14605/* Helper of PyUnicode_Format(): format one arg.
14606 Return 0 on success, raise an exception and return -1 on error. */
14607static int
14608unicode_format_arg(struct unicode_formatter_t *ctx)
14609{
14610 struct unicode_format_arg_t arg;
14611 PyObject *str;
14612 int ret;
14613
Victor Stinner8dbd4212012-12-04 09:30:24 +010014614 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14615 arg.flags = 0;
14616 arg.width = -1;
14617 arg.prec = -1;
14618 arg.sign = 0;
14619 str = NULL;
14620
Victor Stinnera47082312012-10-04 02:19:54 +020014621 ret = unicode_format_arg_parse(ctx, &arg);
14622 if (ret == -1)
14623 return -1;
14624
14625 ret = unicode_format_arg_format(ctx, &arg, &str);
14626 if (ret == -1)
14627 return -1;
14628
14629 if (ret != 1) {
14630 ret = unicode_format_arg_output(ctx, &arg, str);
14631 Py_DECREF(str);
14632 if (ret == -1)
14633 return -1;
14634 }
14635
14636 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14637 PyErr_SetString(PyExc_TypeError,
14638 "not all arguments converted during string formatting");
14639 return -1;
14640 }
14641 return 0;
14642}
14643
Alexander Belopolsky40018472011-02-26 01:02:56 +000014644PyObject *
14645PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014646{
Victor Stinnera47082312012-10-04 02:19:54 +020014647 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014648
Guido van Rossumd57fd912000-03-10 22:53:23 +000014649 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014650 PyErr_BadInternalCall();
14651 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014652 }
Victor Stinnera47082312012-10-04 02:19:54 +020014653
14654 ctx.fmtstr = PyUnicode_FromObject(format);
14655 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014656 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014657 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14658 Py_DECREF(ctx.fmtstr);
14659 return NULL;
14660 }
14661 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14662 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14663 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14664 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014665
Victor Stinner8f674cc2013-04-17 23:02:17 +020014666 _PyUnicodeWriter_Init(&ctx.writer);
14667 ctx.writer.min_length = ctx.fmtcnt + 100;
14668 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014669
Guido van Rossumd57fd912000-03-10 22:53:23 +000014670 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014671 ctx.arglen = PyTuple_Size(args);
14672 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014673 }
14674 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014675 ctx.arglen = -1;
14676 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014677 }
Victor Stinnera47082312012-10-04 02:19:54 +020014678 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014679 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014680 ctx.dict = args;
14681 else
14682 ctx.dict = NULL;
14683 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014684
Victor Stinnera47082312012-10-04 02:19:54 +020014685 while (--ctx.fmtcnt >= 0) {
14686 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014687 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014688
14689 nonfmtpos = ctx.fmtpos++;
14690 while (ctx.fmtcnt >= 0 &&
14691 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14692 ctx.fmtpos++;
14693 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014694 }
Victor Stinnera47082312012-10-04 02:19:54 +020014695 if (ctx.fmtcnt < 0) {
14696 ctx.fmtpos--;
14697 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014698 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014699
Victor Stinnercfc4c132013-04-03 01:48:39 +020014700 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14701 nonfmtpos, ctx.fmtpos) < 0)
14702 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014703 }
14704 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014705 ctx.fmtpos++;
14706 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014707 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014708 }
14709 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014710
Victor Stinnera47082312012-10-04 02:19:54 +020014711 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014712 PyErr_SetString(PyExc_TypeError,
14713 "not all arguments converted during string formatting");
14714 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014715 }
14716
Victor Stinnera47082312012-10-04 02:19:54 +020014717 if (ctx.args_owned) {
14718 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014719 }
Victor Stinnera47082312012-10-04 02:19:54 +020014720 Py_DECREF(ctx.fmtstr);
14721 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014722
Benjamin Peterson29060642009-01-31 22:14:21 +000014723 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014724 Py_DECREF(ctx.fmtstr);
14725 _PyUnicodeWriter_Dealloc(&ctx.writer);
14726 if (ctx.args_owned) {
14727 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014728 }
14729 return NULL;
14730}
14731
Jeremy Hylton938ace62002-07-17 16:30:39 +000014732static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014733unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14734
Tim Peters6d6c1a32001-08-02 04:15:00 +000014735static PyObject *
14736unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14737{
Benjamin Peterson29060642009-01-31 22:14:21 +000014738 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014739 static char *kwlist[] = {"object", "encoding", "errors", 0};
14740 char *encoding = NULL;
14741 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014742
Benjamin Peterson14339b62009-01-31 16:36:08 +000014743 if (type != &PyUnicode_Type)
14744 return unicode_subtype_new(type, args, kwds);
14745 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014746 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014747 return NULL;
14748 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014749 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014750 if (encoding == NULL && errors == NULL)
14751 return PyObject_Str(x);
14752 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014753 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014754}
14755
Guido van Rossume023fe02001-08-30 03:12:59 +000014756static PyObject *
14757unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14758{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014759 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014760 Py_ssize_t length, char_size;
14761 int share_wstr, share_utf8;
14762 unsigned int kind;
14763 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014764
Benjamin Peterson14339b62009-01-31 16:36:08 +000014765 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014766
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014767 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014768 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014769 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014770 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014771 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014772 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014773 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014774 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014775
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014776 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014777 if (self == NULL) {
14778 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014779 return NULL;
14780 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014781 kind = PyUnicode_KIND(unicode);
14782 length = PyUnicode_GET_LENGTH(unicode);
14783
14784 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014785#ifdef Py_DEBUG
14786 _PyUnicode_HASH(self) = -1;
14787#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014788 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014789#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014790 _PyUnicode_STATE(self).interned = 0;
14791 _PyUnicode_STATE(self).kind = kind;
14792 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014793 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014794 _PyUnicode_STATE(self).ready = 1;
14795 _PyUnicode_WSTR(self) = NULL;
14796 _PyUnicode_UTF8_LENGTH(self) = 0;
14797 _PyUnicode_UTF8(self) = NULL;
14798 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014799 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014800
14801 share_utf8 = 0;
14802 share_wstr = 0;
14803 if (kind == PyUnicode_1BYTE_KIND) {
14804 char_size = 1;
14805 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14806 share_utf8 = 1;
14807 }
14808 else if (kind == PyUnicode_2BYTE_KIND) {
14809 char_size = 2;
14810 if (sizeof(wchar_t) == 2)
14811 share_wstr = 1;
14812 }
14813 else {
14814 assert(kind == PyUnicode_4BYTE_KIND);
14815 char_size = 4;
14816 if (sizeof(wchar_t) == 4)
14817 share_wstr = 1;
14818 }
14819
14820 /* Ensure we won't overflow the length. */
14821 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14822 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014823 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014824 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014825 data = PyObject_MALLOC((length + 1) * char_size);
14826 if (data == NULL) {
14827 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014828 goto onError;
14829 }
14830
Victor Stinnerc3c74152011-10-02 20:39:55 +020014831 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014832 if (share_utf8) {
14833 _PyUnicode_UTF8_LENGTH(self) = length;
14834 _PyUnicode_UTF8(self) = data;
14835 }
14836 if (share_wstr) {
14837 _PyUnicode_WSTR_LENGTH(self) = length;
14838 _PyUnicode_WSTR(self) = (wchar_t *)data;
14839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014840
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014841 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014842 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014843 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014844#ifdef Py_DEBUG
14845 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14846#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014847 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014848 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014849
14850onError:
14851 Py_DECREF(unicode);
14852 Py_DECREF(self);
14853 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014854}
14855
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014867
/* Forward declaration: unicode_iter() is defined after the iterator type
   but is needed for the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of str.  Slots are listed positionally; the trailing
   comment on each line names the PyTypeObject field being initialized. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                          /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                              /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                              /* tp_print */
    0,                              /* tp_getattr */
    0,                              /* tp_setattr */
    0,                              /* tp_reserved */
    unicode_repr,                   /* tp_repr */
    &unicode_as_number,             /* tp_as_number */
    &unicode_as_sequence,           /* tp_as_sequence */
    &unicode_as_mapping,            /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                              /* tp_call*/
    (reprfunc) unicode_str,         /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                              /* tp_setattro */
    0,                              /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,                    /* tp_doc */
    0,                              /* tp_traverse */
    0,                              /* tp_clear */
    PyUnicode_RichCompare,          /* tp_richcompare */
    0,                              /* tp_weaklistoffset */
    unicode_iter,                   /* tp_iter */
    0,                              /* tp_iternext */
    unicode_methods,                /* tp_methods */
    0,                              /* tp_members */
    0,                              /* tp_getset */
    &PyBaseObject_Type,             /* tp_base */
    0,                              /* tp_dict */
    0,                              /* tp_descr_get */
    0,                              /* tp_descr_set */
    0,                              /* tp_dictoffset */
    0,                              /* tp_init */
    0,                              /* tp_alloc */
    unicode_new,                    /* tp_new */
    PyObject_Del,                   /* tp_free */
};
14913
14914/* Initialize the Unicode implementation */
14915
Victor Stinner3a50e702011-10-18 21:21:00 +020014916int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014917{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014918 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014919 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014920 0x000A, /* LINE FEED */
14921 0x000D, /* CARRIAGE RETURN */
14922 0x001C, /* FILE SEPARATOR */
14923 0x001D, /* GROUP SEPARATOR */
14924 0x001E, /* RECORD SEPARATOR */
14925 0x0085, /* NEXT LINE */
14926 0x2028, /* LINE SEPARATOR */
14927 0x2029, /* PARAGRAPH SEPARATOR */
14928 };
14929
Fred Drakee4315f52000-05-09 19:53:39 +000014930 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014931 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014932 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014933 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014934 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014935
Guido van Rossumcacfc072002-05-24 19:01:59 +000014936 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014937 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014938
14939 /* initialize the linebreak bloom filter */
14940 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014941 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014942 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014943
Christian Heimes26532f72013-07-20 14:57:16 +020014944 if (PyType_Ready(&EncodingMapType) < 0)
14945 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014946
Benjamin Petersonc4311282012-10-30 23:21:10 -040014947 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14948 Py_FatalError("Can't initialize field name iterator type");
14949
14950 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14951 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014952
Victor Stinner3a50e702011-10-18 21:21:00 +020014953#ifdef HAVE_MBCS
14954 winver.dwOSVersionInfoSize = sizeof(winver);
14955 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14956 PyErr_SetFromWindowsErr(0);
14957 return -1;
14958 }
14959#endif
14960 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014961}
14962
14963/* Finalize the Unicode implementation */
14964
/* This implementation does nothing and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14970
Guido van Rossumd57fd912000-03-10 22:53:23 +000014971void
Thomas Wouters78890102000-07-22 19:25:51 +000014972_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014973{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014974 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014975
Serhiy Storchaka05997252013-01-26 12:14:02 +020014976 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014977
Serhiy Storchaka05997252013-01-26 12:14:02 +020014978 for (i = 0; i < 256; i++)
14979 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014980 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014981 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014982}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014983
Walter Dörwald16807132007-05-25 13:52:07 +000014984void
14985PyUnicode_InternInPlace(PyObject **p)
14986{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020014987 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014988 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014989#ifdef Py_DEBUG
14990 assert(s != NULL);
14991 assert(_PyUnicode_CHECK(s));
14992#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014993 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014994 return;
14995#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014996 /* If it's a subclass, we don't really know what putting
14997 it in the interned dict might do. */
14998 if (!PyUnicode_CheckExact(s))
14999 return;
15000 if (PyUnicode_CHECK_INTERNED(s))
15001 return;
15002 if (interned == NULL) {
15003 interned = PyDict_New();
15004 if (interned == NULL) {
15005 PyErr_Clear(); /* Don't leave an exception */
15006 return;
15007 }
15008 }
15009 /* It might be that the GetItem call fails even
15010 though the key is present in the dictionary,
15011 namely when this happens during a stack overflow. */
15012 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010015013 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015014 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000015015
Victor Stinnerf0335102013-04-14 19:13:03 +020015016 if (t) {
15017 Py_INCREF(t);
15018 Py_DECREF(*p);
15019 *p = t;
15020 return;
15021 }
Walter Dörwald16807132007-05-25 13:52:07 +000015022
Benjamin Peterson14339b62009-01-31 16:36:08 +000015023 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015024 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015025 PyErr_Clear();
15026 PyThreadState_GET()->recursion_critical = 0;
15027 return;
15028 }
15029 PyThreadState_GET()->recursion_critical = 0;
15030 /* The two references in interned are not counted by refcnt.
15031 The deallocator will take care of this */
15032 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015033 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015034}
15035
15036void
15037PyUnicode_InternImmortal(PyObject **p)
15038{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015039 PyUnicode_InternInPlace(p);
15040 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015041 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015042 Py_INCREF(*p);
15043 }
Walter Dörwald16807132007-05-25 13:52:07 +000015044}
15045
15046PyObject *
15047PyUnicode_InternFromString(const char *cp)
15048{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015049 PyObject *s = PyUnicode_FromString(cp);
15050 if (s == NULL)
15051 return NULL;
15052 PyUnicode_InternInPlace(&s);
15053 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015054}
15055
Alexander Belopolsky40018472011-02-26 01:02:56 +000015056void
15057_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015058{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015059 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015060 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015061 Py_ssize_t i, n;
15062 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015063
Benjamin Peterson14339b62009-01-31 16:36:08 +000015064 if (interned == NULL || !PyDict_Check(interned))
15065 return;
15066 keys = PyDict_Keys(interned);
15067 if (keys == NULL || !PyList_Check(keys)) {
15068 PyErr_Clear();
15069 return;
15070 }
Walter Dörwald16807132007-05-25 13:52:07 +000015071
Benjamin Peterson14339b62009-01-31 16:36:08 +000015072 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15073 detector, interned unicode strings are not forcibly deallocated;
15074 rather, we give them their stolen references back, and then clear
15075 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015076
Benjamin Peterson14339b62009-01-31 16:36:08 +000015077 n = PyList_GET_SIZE(keys);
15078 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015079 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015080 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015081 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015082 if (PyUnicode_READY(s) == -1) {
15083 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015084 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015085 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015086 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015087 case SSTATE_NOT_INTERNED:
15088 /* XXX Shouldn't happen */
15089 break;
15090 case SSTATE_INTERNED_IMMORTAL:
15091 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015092 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015093 break;
15094 case SSTATE_INTERNED_MORTAL:
15095 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015096 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015097 break;
15098 default:
15099 Py_FatalError("Inconsistent interned string state.");
15100 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015101 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015102 }
15103 fprintf(stderr, "total size of all interned strings: "
15104 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15105 "mortal/immortal\n", mortal_size, immortal_size);
15106 Py_DECREF(keys);
15107 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015108 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015109}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015110
15111
15112/********************* Unicode Iterator **************************/
15113
/* State of a str iterator ("str_iterator" objects). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15119
15120static void
15121unicodeiter_dealloc(unicodeiterobject *it)
15122{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015123 _PyObject_GC_UNTRACK(it);
15124 Py_XDECREF(it->it_seq);
15125 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015126}
15127
15128static int
15129unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15130{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015131 Py_VISIT(it->it_seq);
15132 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015133}
15134
15135static PyObject *
15136unicodeiter_next(unicodeiterobject *it)
15137{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015138 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015139
Benjamin Peterson14339b62009-01-31 16:36:08 +000015140 assert(it != NULL);
15141 seq = it->it_seq;
15142 if (seq == NULL)
15143 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015144 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015146 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15147 int kind = PyUnicode_KIND(seq);
15148 void *data = PyUnicode_DATA(seq);
15149 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15150 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015151 if (item != NULL)
15152 ++it->it_index;
15153 return item;
15154 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015155
Benjamin Peterson14339b62009-01-31 16:36:08 +000015156 Py_DECREF(seq);
15157 it->it_seq = NULL;
15158 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015159}
15160
15161static PyObject *
15162unicodeiter_len(unicodeiterobject *it)
15163{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015164 Py_ssize_t len = 0;
15165 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015166 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015167 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015168}
15169
15170PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15171
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015172static PyObject *
15173unicodeiter_reduce(unicodeiterobject *it)
15174{
15175 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015176 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015177 it->it_seq, it->it_index);
15178 } else {
15179 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15180 if (u == NULL)
15181 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015182 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015183 }
15184}
15185
15186PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15187
15188static PyObject *
15189unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15190{
15191 Py_ssize_t index = PyLong_AsSsize_t(state);
15192 if (index == -1 && PyErr_Occurred())
15193 return NULL;
15194 if (index < 0)
15195 index = 0;
15196 it->it_index = index;
15197 Py_RETURN_NONE;
15198}
15199
15200PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15201
/* Method table of str_iterator: length hint and pickling support. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,      NULL}       /* sentinel */
};
15211
/* Type object of the iterator returned by unicode_iter().  Slots are
   listed positionally; the trailing comment names each field. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
15244
15245static PyObject *
15246unicode_iter(PyObject *seq)
15247{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015248 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015249
Benjamin Peterson14339b62009-01-31 16:36:08 +000015250 if (!PyUnicode_Check(seq)) {
15251 PyErr_BadInternalCall();
15252 return NULL;
15253 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015254 if (PyUnicode_READY(seq) == -1)
15255 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015256 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15257 if (it == NULL)
15258 return NULL;
15259 it->it_index = 0;
15260 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015261 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015262 _PyObject_GC_TRACK(it);
15263 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015264}
15265
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015266
15267size_t
15268Py_UNICODE_strlen(const Py_UNICODE *u)
15269{
15270 int res = 0;
15271 while(*u++)
15272 res++;
15273 return res;
15274}
15275
15276Py_UNICODE*
15277Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15278{
15279 Py_UNICODE *u = s1;
15280 while ((*u++ = *s2++));
15281 return s1;
15282}
15283
15284Py_UNICODE*
15285Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15286{
15287 Py_UNICODE *u = s1;
15288 while ((*u++ = *s2++))
15289 if (n-- == 0)
15290 break;
15291 return s1;
15292}
15293
15294Py_UNICODE*
15295Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15296{
15297 Py_UNICODE *u1 = s1;
15298 u1 += Py_UNICODE_strlen(u1);
15299 Py_UNICODE_strcpy(u1, s2);
15300 return s1;
15301}
15302
15303int
15304Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15305{
15306 while (*s1 && *s2 && *s1 == *s2)
15307 s1++, s2++;
15308 if (*s1 && *s2)
15309 return (*s1 < *s2) ? -1 : +1;
15310 if (*s1)
15311 return 1;
15312 if (*s2)
15313 return -1;
15314 return 0;
15315}
15316
15317int
15318Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15319{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015320 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015321 for (; n != 0; n--) {
15322 u1 = *s1;
15323 u2 = *s2;
15324 if (u1 != u2)
15325 return (u1 < u2) ? -1 : +1;
15326 if (u1 == '\0')
15327 return 0;
15328 s1++;
15329 s2++;
15330 }
15331 return 0;
15332}
15333
15334Py_UNICODE*
15335Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15336{
15337 const Py_UNICODE *p;
15338 for (p = s; *p; p++)
15339 if (*p == c)
15340 return (Py_UNICODE*)p;
15341 return NULL;
15342}
15343
15344Py_UNICODE*
15345Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15346{
15347 const Py_UNICODE *p;
15348 p = s + Py_UNICODE_strlen(s);
15349 while (p != s) {
15350 p--;
15351 if (*p == c)
15352 return (Py_UNICODE*)p;
15353 }
15354 return NULL;
15355}
Victor Stinner331ea922010-08-10 16:37:20 +000015356
Victor Stinner71133ff2010-09-01 23:43:53 +000015357Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015358PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015359{
Victor Stinner577db2c2011-10-11 22:12:48 +020015360 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015361 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015362
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015363 if (!PyUnicode_Check(unicode)) {
15364 PyErr_BadArgument();
15365 return NULL;
15366 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015367 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015368 if (u == NULL)
15369 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015370 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015371 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015372 PyErr_NoMemory();
15373 return NULL;
15374 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015375 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015376 size *= sizeof(Py_UNICODE);
15377 copy = PyMem_Malloc(size);
15378 if (copy == NULL) {
15379 PyErr_NoMemory();
15380 return NULL;
15381 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015382 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015383 return copy;
15384}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015385
Georg Brandl66c221e2010-10-14 07:04:07 +000015386/* A _string module, to export formatter_parser and formatter_field_name_split
15387 to the string.Formatter class implemented in Python. */
15388
15389static PyMethodDef _string_methods[] = {
15390 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15391 METH_O, PyDoc_STR("split the argument as a field name")},
15392 {"formatter_parser", (PyCFunction) formatter_parser,
15393 METH_O, PyDoc_STR("parse the argument as a format string")},
15394 {NULL, NULL}
15395};
15396
15397static struct PyModuleDef _string_module = {
15398 PyModuleDef_HEAD_INIT,
15399 "_string",
15400 PyDoc_STR("string helper module"),
15401 0,
15402 _string_methods,
15403 NULL,
15404 NULL,
15405 NULL,
15406 NULL
15407};
15408
15409PyMODINIT_FUNC
15410PyInit__string(void)
15411{
15412 return PyModule_Create(&_string_module);
15413}
15414
15415
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015416#ifdef __cplusplus
15417}
15418#endif