blob: 994c4f570f7f27a958bbefc5fae7e8414c0a1173 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Larry Hastings61272b72014-01-07 12:41:53 -080050/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080051class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080052[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080053/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080054
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055/* --- Globals ------------------------------------------------------------
56
Serhiy Storchaka05997252013-01-26 12:14:02 +020057NOTE: In the interpreter's initialization phase, some globals are currently
58 initialized dynamically as needed. In the process Unicode objects may
59 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000060
61*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063
64#ifdef __cplusplus
65extern "C" {
66#endif
67
Victor Stinner8faf8212011-12-08 22:14:11 +010068/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
69#define MAX_UNICODE 0x10ffff
70
/* Type check used by the assertions in this file.  When Py_DEBUG is
   defined it expands to _PyUnicode_CheckConsistency(op, 0); otherwise it
   is a plain PyUnicode_Check(op). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020076
/* Field accessors for Unicode objects.

   Macros whose name starts with an underscore and that contain no assert()
   are raw accessors: they cast op to the structure that holds the field
   and yield the field itself, so they can also be used as assignment
   targets.  The macros containing assert() are checked, read-only
   accessors. */

/* Raw access to the utf8 pointer of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked access to the UTF-8 data.  For a compact ASCII object the result
   is the address immediately following the PyASCIIObject header; for every
   other object it is the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the utf8_length field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked access to the UTF-8 length.  For a compact ASCII object this is
   the length field of the PyASCIIObject; otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw access to the wstr pointer (stored in the PyASCIIObject part). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* Raw access to wstr_length (stored in the PyCompactUnicodeObject part). */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw access to the length field. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
/* Raw access to the whole state structure. */
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
/* Raw access to the hash field. */
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Checked read of state.kind. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* Checked read of the length field. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200111
/* File-local replacement for PyUnicode_READY; the #undef discards any
   definition made earlier.  After asserting _PyUnicode_CHECK(op), the
   expression is 0 when PyUnicode_IS_READY(op) is already true and
   otherwise the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                        \
    (assert(_PyUnicode_CHECK(op)),                 \
     (PyUnicode_IS_READY(op) ?                     \
      0 :                                          \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True if the utf8 pointer of op is the same address as its character
   data, i.e. no separate UTF-8 buffer is in use.  Must not be used on
   compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op is the same address as its character
   data, i.e. the wchar_t representation is not a separate buffer.

   The macro argument is used consistently as `op`; the expansion must not
   refer to any identifier of the calling scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its
   utf8 pointer is non-NULL, and that pointer differs from the character
   data pointer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is non-NULL and either
   the object is not ready or wstr differs from the character data
   pointer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
140
/* Copy a run of characters while converting each one from from_type to
   to_type with a plain cast.

   from_type, to_type: valid type names.
   begin, end:         bounds of the source run, as "from_type *".
   to:                 destination buffer, as "to_type *"; it must have room
                       for (end - begin) elements.

   The bulk of the run is processed four elements per iteration; the
   remaining (at most three) elements are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_dst = (to_type *)(to);                                \
        const from_type *_src = (from_type *)(begin);                   \
        const from_type *_stop = (from_type *)(end);                    \
        Py_ssize_t _count = _stop - _src;                               \
        const from_type *_stop4 =                                       \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);                      \
        for (; _src < _stop4; _src += 4, _dst += 4) {                   \
            _dst[0] = (to_type) _src[0];                                \
            _dst[1] = (to_type) _src[1];                                \
            _dst[2] = (to_type) _src[2];                                \
            _dst[3] = (to_type) _src[3];                                \
        }                                                               \
        for (; _src < _stop; _src++, _dst++)                            \
            *_dst = (to_type) *_src;                                    \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200173static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000174
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200176static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200177
/* Take a new reference to the shared empty string.

   If unicode_empty has not been created yet, it is created here with
   PyUnicode_New(0, 0) and, when that succeeds, a reference is added for
   the caller.  If the creation fails, unicode_empty stays NULL and no
   reference is taken, so callers must check unicode_empty afterwards. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function.  The returned value is NULL when the empty string could not
   be created by _Py_INCREF_UNICODE_EMPTY(). */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200197/* Forward declaration */
198Py_LOCAL_INLINE(int)
199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
200
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by an ASCII code point (0..127); an entry of 1
   marks a whitespace character.  Laid out as 8 rows of 16 entries. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100242static int unicode_modifiable(PyObject *unicode);
243
Victor Stinnerfe226c02011-10-03 03:52:20 +0200244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200247static PyObject *
248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
249static PyObject *
250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
251
252static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100255 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
257
Alexander Belopolsky40018472011-02-26 01:02:56 +0000258static void
259raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300260 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100261 PyObject *unicode,
262 Py_ssize_t startpos, Py_ssize_t endpos,
263 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000264
/* Fast detection of ASCII line break characters.

   Lookup table indexed by an ASCII code point (0..127); an entry of 1
   marks a line break.  The table is only ever read, so it is declared
   const: accidental writes become compile errors and the data can be
   placed in read-only storage. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
292
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300293/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
294 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000296PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000298#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000299 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000300#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 /* This is actually an illegal character, so it should
302 not be passed to unichr. */
303 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000304#endif
305}
306
#ifdef Py_DEBUG
/* Debug-only validation of the internal invariants of a Unicode object.

   op:            object to inspect; must satisfy PyUnicode_Check().
   check_content: if non-zero (and the object is not of wchar kind), the
                  characters are scanned as well to verify that the
                  narrowest possible kind is in use and that the data is
                  followed by a null character.

   Every violation is reported through assert().  The function always
   returns 1, which makes it usable as assert(_PyUnicode_CheckConsistency(...)). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: always 1-byte kind and ready */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the character data starts right after
               the PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* non-compact: the character data is referenced by data.any */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr representation exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data: utf8 must point at the data itself */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must point at the data exactly when the kind selected
               by SIZEOF_WCHAR_T below is in use */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* a missing buffer must come with a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        /* find the largest character actually stored */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the element right after the last character must be null */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
422
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100423static PyObject*
424unicode_result_wchar(PyObject *unicode)
425{
426#ifndef Py_DEBUG
427 Py_ssize_t len;
428
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100429 len = _PyUnicode_WSTR_LENGTH(unicode);
430 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200432 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 }
434
435 if (len == 1) {
436 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100437 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100438 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
439 Py_DECREF(unicode);
440 return latin1_char;
441 }
442 }
443
444 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200445 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100446 return NULL;
447 }
448#else
Victor Stinneraa771272012-10-04 02:32:58 +0200449 assert(Py_REFCNT(unicode) == 1);
450
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100451 /* don't make the result ready in debug mode to ensure that the caller
452 makes the string ready before using it */
453 assert(_PyUnicode_CheckConsistency(unicode, 1));
454#endif
455 return unicode;
456}
457
458static PyObject*
459unicode_result_ready(PyObject *unicode)
460{
461 Py_ssize_t length;
462
463 length = PyUnicode_GET_LENGTH(unicode);
464 if (length == 0) {
465 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100466 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200467 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 }
469 return unicode_empty;
470 }
471
472 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200473 void *data = PyUnicode_DATA(unicode);
474 int kind = PyUnicode_KIND(unicode);
475 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100476 if (ch < 256) {
477 PyObject *latin1_char = unicode_latin1[ch];
478 if (latin1_char != NULL) {
479 if (unicode != latin1_char) {
480 Py_INCREF(latin1_char);
481 Py_DECREF(unicode);
482 }
483 return latin1_char;
484 }
485 else {
486 assert(_PyUnicode_CheckConsistency(unicode, 1));
487 Py_INCREF(unicode);
488 unicode_latin1[ch] = unicode;
489 return unicode;
490 }
491 }
492 }
493
494 assert(_PyUnicode_CheckConsistency(unicode, 1));
495 return unicode;
496}
497
498static PyObject*
499unicode_result(PyObject *unicode)
500{
501 assert(_PyUnicode_CHECK(unicode));
502 if (PyUnicode_IS_READY(unicode))
503 return unicode_result_ready(unicode);
504 else
505 return unicode_result_wchar(unicode);
506}
507
Victor Stinnerc4b49542011-12-11 22:44:26 +0100508static PyObject*
509unicode_result_unchanged(PyObject *unicode)
510{
511 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500512 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100513 return NULL;
514 Py_INCREF(unicode);
515 return unicode;
516 }
517 else
518 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100519 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100520}
521
Victor Stinner3a50e702011-10-18 21:21:00 +0200522#ifdef HAVE_MBCS
523static OSVERSIONINFOEX winver;
524#endif
525
Thomas Wouters477c8d52006-05-27 19:21:47 +0000526/* --- Bloom Filters ----------------------------------------------------- */
527
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask and take the low bits of
   each Unicode character (its value modulo BLOOM_WIDTH) as the bit index. */
531
532/* the linebreak mask is set up by Unicode_Init below */
533
Antoine Pitrouf068f942010-01-13 14:19:12 +0000534#if LONG_BIT >= 128
535#define BLOOM_WIDTH 128
536#elif LONG_BIT >= 64
537#define BLOOM_WIDTH 64
538#elif LONG_BIT >= 32
539#define BLOOM_WIDTH 32
540#else
541#error "LONG_BIT is smaller than 32"
542#endif
543
Thomas Wouters477c8d52006-05-27 19:21:47 +0000544#define BLOOM_MASK unsigned long
545
Serhiy Storchaka05997252013-01-26 12:14:02 +0200546static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547
/* Non-zero if the bit selected by ch (its value modulo BLOOM_WIDTH) is set
   in mask.  Both arguments are parenthesized so that compound expressions
   such as BLOOM(a | b, ch) are evaluated as intended. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000549
/* Line break test for a character ch.  Values below 128 are looked up
   directly in ascii_linebreak[]; for larger values the bloom_linebreak
   mask is consulted first and Py_UNICODE_ISLINEBREAK() is only called
   when the mask does not rule the character out.  Note that ch is
   evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000553
Alexander Belopolsky40018472011-02-26 01:02:56 +0000554Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000556{
Victor Stinnera85af502013-04-09 21:53:54 +0200557#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
558 do { \
559 TYPE *data = (TYPE *)PTR; \
560 TYPE *end = data + LEN; \
561 Py_UCS4 ch; \
562 for (; data != end; data++) { \
563 ch = *data; \
564 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
565 } \
566 break; \
567 } while (0)
568
Thomas Wouters477c8d52006-05-27 19:21:47 +0000569 /* calculate simple bloom-style bitmask for a given unicode string */
570
Antoine Pitrouf068f942010-01-13 14:19:12 +0000571 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000572
573 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200574 switch (kind) {
575 case PyUnicode_1BYTE_KIND:
576 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
577 break;
578 case PyUnicode_2BYTE_KIND:
579 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
580 break;
581 case PyUnicode_4BYTE_KIND:
582 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
583 break;
584 default:
585 assert(0);
586 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000587 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200588
589#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000590}
591
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200592/* Compilation of templated routines */
593
594#include "stringlib/asciilib.h"
595#include "stringlib/fastsearch.h"
596#include "stringlib/partition.h"
597#include "stringlib/split.h"
598#include "stringlib/count.h"
599#include "stringlib/find.h"
600#include "stringlib/find_max_char.h"
601#include "stringlib/localeutil.h"
602#include "stringlib/undef.h"
603
604#include "stringlib/ucs1lib.h"
605#include "stringlib/fastsearch.h"
606#include "stringlib/partition.h"
607#include "stringlib/split.h"
608#include "stringlib/count.h"
609#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300610#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200611#include "stringlib/find_max_char.h"
612#include "stringlib/localeutil.h"
613#include "stringlib/undef.h"
614
615#include "stringlib/ucs2lib.h"
616#include "stringlib/fastsearch.h"
617#include "stringlib/partition.h"
618#include "stringlib/split.h"
619#include "stringlib/count.h"
620#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300621#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200622#include "stringlib/find_max_char.h"
623#include "stringlib/localeutil.h"
624#include "stringlib/undef.h"
625
626#include "stringlib/ucs4lib.h"
627#include "stringlib/fastsearch.h"
628#include "stringlib/partition.h"
629#include "stringlib/split.h"
630#include "stringlib/count.h"
631#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300632#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200633#include "stringlib/find_max_char.h"
634#include "stringlib/localeutil.h"
635#include "stringlib/undef.h"
636
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200637#include "stringlib/unicodedefs.h"
638#include "stringlib/fastsearch.h"
639#include "stringlib/count.h"
640#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100641#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643/* --- Unicode Object ----------------------------------------------------- */
644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200646fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200648Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
649 Py_ssize_t size, Py_UCS4 ch,
650 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200651{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200652 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
653
654 switch (kind) {
655 case PyUnicode_1BYTE_KIND:
656 {
657 Py_UCS1 ch1 = (Py_UCS1) ch;
658 if (ch1 == ch)
659 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
660 else
661 return -1;
662 }
663 case PyUnicode_2BYTE_KIND:
664 {
665 Py_UCS2 ch2 = (Py_UCS2) ch;
666 if (ch2 == ch)
667 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
668 else
669 return -1;
670 }
671 case PyUnicode_4BYTE_KIND:
672 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
673 default:
674 assert(0);
675 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200677}
678
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters in [old_length, current length) are overwritten, each
   byte being set to 0xff; nothing is done when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* Offsets are scaled by char_size because `bytes` is a byte view of
           the character buffer. */
        memset(bytes + old_length * char_size,
               0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
697
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698static PyObject*
699resize_compact(PyObject *unicode, Py_ssize_t length)
700{
701 Py_ssize_t char_size;
702 Py_ssize_t struct_size;
703 Py_ssize_t new_size;
704 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100705 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200706#ifdef Py_DEBUG
707 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
708#endif
709
Victor Stinner79891572012-05-03 13:43:07 +0200710 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200711 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100712 assert(PyUnicode_IS_COMPACT(unicode));
713
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200714 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100715 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 struct_size = sizeof(PyASCIIObject);
717 else
718 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200719 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200720
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
722 PyErr_NoMemory();
723 return NULL;
724 }
725 new_size = (struct_size + (length + 1) * char_size);
726
Victor Stinner84def372011-12-11 20:04:56 +0100727 _Py_DEC_REFTOTAL;
728 _Py_ForgetReference(unicode);
729
730 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
731 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100732 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200733 PyErr_NoMemory();
734 return NULL;
735 }
Victor Stinner84def372011-12-11 20:04:56 +0100736 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100738
Victor Stinnerfe226c02011-10-03 03:52:20 +0200739 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200740 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100742 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200743 _PyUnicode_WSTR_LENGTH(unicode) = length;
744 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100745 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
746 PyObject_DEL(_PyUnicode_WSTR(unicode));
747 _PyUnicode_WSTR(unicode) = NULL;
748 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200749#ifdef Py_DEBUG
750 unicode_fill_invalid(unicode, old_length);
751#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200752 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
753 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200754 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200755 return unicode;
756}
757
Alexander Belopolsky40018472011-02-26 01:02:56 +0000758static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200759resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000760{
Victor Stinner95663112011-10-04 01:03:50 +0200761 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100762 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200763 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000765
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 if (PyUnicode_IS_READY(unicode)) {
767 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200768 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200770#ifdef Py_DEBUG
771 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
772#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773
774 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200775 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200776 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
777 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200778
779 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
780 PyErr_NoMemory();
781 return -1;
782 }
783 new_size = (length + 1) * char_size;
784
Victor Stinner7a9105a2011-12-12 00:13:42 +0100785 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
786 {
787 PyObject_DEL(_PyUnicode_UTF8(unicode));
788 _PyUnicode_UTF8(unicode) = NULL;
789 _PyUnicode_UTF8_LENGTH(unicode) = 0;
790 }
791
Victor Stinnerfe226c02011-10-03 03:52:20 +0200792 data = (PyObject *)PyObject_REALLOC(data, new_size);
793 if (data == NULL) {
794 PyErr_NoMemory();
795 return -1;
796 }
797 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200798 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200799 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200800 _PyUnicode_WSTR_LENGTH(unicode) = length;
801 }
802 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200803 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200804 _PyUnicode_UTF8_LENGTH(unicode) = length;
805 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200806 _PyUnicode_LENGTH(unicode) = length;
807 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200808#ifdef Py_DEBUG
809 unicode_fill_invalid(unicode, old_length);
810#endif
Victor Stinner95663112011-10-04 01:03:50 +0200811 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200812 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200814 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200815 }
Victor Stinner95663112011-10-04 01:03:50 +0200816 assert(_PyUnicode_WSTR(unicode) != NULL);
817
818 /* check for integer overflow */
819 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
820 PyErr_NoMemory();
821 return -1;
822 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100823 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200824 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100825 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200826 if (!wstr) {
827 PyErr_NoMemory();
828 return -1;
829 }
830 _PyUnicode_WSTR(unicode) = wstr;
831 _PyUnicode_WSTR(unicode)[length] = 0;
832 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200833 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834 return 0;
835}
836
Victor Stinnerfe226c02011-10-03 03:52:20 +0200837static PyObject*
838resize_copy(PyObject *unicode, Py_ssize_t length)
839{
840 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100841 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200842 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100843
Benjamin Petersonbac79492012-01-14 13:34:47 -0500844 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100845 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200846
847 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
848 if (copy == NULL)
849 return NULL;
850
851 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200852 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200853 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200854 }
855 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200856 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100857
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200858 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200859 if (w == NULL)
860 return NULL;
861 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
862 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200863 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
864 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200865 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200866 }
867}
868
Guido van Rossumd57fd912000-03-10 22:53:23 +0000869/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000870 Ux0000 terminated; some code (e.g. new_identifier)
871 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000872
873 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000874 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000875
876*/
877
Alexander Belopolsky40018472011-02-26 01:02:56 +0000878static PyUnicodeObject *
879_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200881 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200882 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883
Thomas Wouters477c8d52006-05-27 19:21:47 +0000884 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885 if (length == 0 && unicode_empty != NULL) {
886 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200887 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000888 }
889
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000890 /* Ensure we won't overflow the size. */
891 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
892 return (PyUnicodeObject *)PyErr_NoMemory();
893 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200894 if (length < 0) {
895 PyErr_SetString(PyExc_SystemError,
896 "Negative size passed to _PyUnicode_New");
897 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000898 }
899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200900 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
901 if (unicode == NULL)
902 return NULL;
903 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100904
905 _PyUnicode_WSTR_LENGTH(unicode) = length;
906 _PyUnicode_HASH(unicode) = -1;
907 _PyUnicode_STATE(unicode).interned = 0;
908 _PyUnicode_STATE(unicode).kind = 0;
909 _PyUnicode_STATE(unicode).compact = 0;
910 _PyUnicode_STATE(unicode).ready = 0;
911 _PyUnicode_STATE(unicode).ascii = 0;
912 _PyUnicode_DATA_ANY(unicode) = NULL;
913 _PyUnicode_LENGTH(unicode) = 0;
914 _PyUnicode_UTF8(unicode) = NULL;
915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
918 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100919 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000920 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100921 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000922 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200923
Jeremy Hyltond8082792003-09-16 19:41:39 +0000924 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000925 * the caller fails before initializing str -- unicode_resize()
926 * reads str[0], and the Keep-Alive optimization can keep memory
927 * allocated for str alive across a call to unicode_dealloc(unicode).
928 * We don't want unicode_resize to read uninitialized memory in
929 * that case.
930 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200931 _PyUnicode_WSTR(unicode)[0] = 0;
932 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100933
Victor Stinner7931d9a2011-11-04 00:22:48 +0100934 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000935 return unicode;
936}
937
Victor Stinnerf42dc442011-10-02 23:33:16 +0200938static const char*
939unicode_kind_name(PyObject *unicode)
940{
Victor Stinner42dfd712011-10-03 14:41:45 +0200941 /* don't check consistency: unicode_kind_name() is called from
942 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943 if (!PyUnicode_IS_COMPACT(unicode))
944 {
945 if (!PyUnicode_IS_READY(unicode))
946 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600947 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200948 {
949 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200951 return "legacy ascii";
952 else
953 return "legacy latin1";
954 case PyUnicode_2BYTE_KIND:
955 return "legacy UCS2";
956 case PyUnicode_4BYTE_KIND:
957 return "legacy UCS4";
958 default:
959 return "<legacy invalid kind>";
960 }
961 }
962 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600963 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200964 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200965 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200966 return "ascii";
967 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200968 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200970 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200971 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200972 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200973 default:
974 return "<invalid compact kind>";
975 }
976}
977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200978#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}
983
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}
987void *_PyUnicode_data(void *unicode){
988 printf("obj %p\n", unicode);
989 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
990 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
991 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
992 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
993 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
994 return PyUnicode_DATA(unicode);
995}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200996
997void
998_PyUnicode_Dump(PyObject *op)
999{
1000 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001001 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1002 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1003 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001004
Victor Stinnera849a4b2011-10-03 12:12:11 +02001005 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001006 {
1007 if (ascii->state.ascii)
1008 data = (ascii + 1);
1009 else
1010 data = (compact + 1);
1011 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001012 else
1013 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +02001014 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
1015
Victor Stinnera849a4b2011-10-03 12:12:11 +02001016 if (ascii->wstr == data)
1017 printf("shared ");
1018 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001019
Victor Stinnera3b334d2011-10-03 13:53:37 +02001020 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001021 printf(" (%zu), ", compact->wstr_length);
1022 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1023 printf("shared ");
1024 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001025 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001026 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001027}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001028#endif
1029
1030PyObject *
1031PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1032{
1033 PyObject *obj;
1034 PyCompactUnicodeObject *unicode;
1035 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001036 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001037 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001038 Py_ssize_t char_size;
1039 Py_ssize_t struct_size;
1040
1041 /* Optimization for empty strings */
1042 if (size == 0 && unicode_empty != NULL) {
1043 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001044 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 }
1046
Victor Stinner9e9d6892011-10-04 01:02:02 +02001047 is_ascii = 0;
1048 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 struct_size = sizeof(PyCompactUnicodeObject);
1050 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001051 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 char_size = 1;
1053 is_ascii = 1;
1054 struct_size = sizeof(PyASCIIObject);
1055 }
1056 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001057 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 char_size = 1;
1059 }
1060 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001061 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 char_size = 2;
1063 if (sizeof(wchar_t) == 2)
1064 is_sharing = 1;
1065 }
1066 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001067 if (maxchar > MAX_UNICODE) {
1068 PyErr_SetString(PyExc_SystemError,
1069 "invalid maximum character passed to PyUnicode_New");
1070 return NULL;
1071 }
Victor Stinner8f825062012-04-27 13:55:39 +02001072 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001073 char_size = 4;
1074 if (sizeof(wchar_t) == 4)
1075 is_sharing = 1;
1076 }
1077
1078 /* Ensure we won't overflow the size. */
1079 if (size < 0) {
1080 PyErr_SetString(PyExc_SystemError,
1081 "Negative size passed to PyUnicode_New");
1082 return NULL;
1083 }
1084 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1085 return PyErr_NoMemory();
1086
1087 /* Duplicated allocation code from _PyObject_New() instead of a call to
1088 * PyObject_New() so we are able to allocate space for the object and
1089 * it's data buffer.
1090 */
1091 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1092 if (obj == NULL)
1093 return PyErr_NoMemory();
1094 obj = PyObject_INIT(obj, &PyUnicode_Type);
1095 if (obj == NULL)
1096 return NULL;
1097
1098 unicode = (PyCompactUnicodeObject *)obj;
1099 if (is_ascii)
1100 data = ((PyASCIIObject*)obj) + 1;
1101 else
1102 data = unicode + 1;
1103 _PyUnicode_LENGTH(unicode) = size;
1104 _PyUnicode_HASH(unicode) = -1;
1105 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001106 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 _PyUnicode_STATE(unicode).compact = 1;
1108 _PyUnicode_STATE(unicode).ready = 1;
1109 _PyUnicode_STATE(unicode).ascii = is_ascii;
1110 if (is_ascii) {
1111 ((char*)data)[size] = 0;
1112 _PyUnicode_WSTR(unicode) = NULL;
1113 }
Victor Stinner8f825062012-04-27 13:55:39 +02001114 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001115 ((char*)data)[size] = 0;
1116 _PyUnicode_WSTR(unicode) = NULL;
1117 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001119 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001120 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 else {
1122 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001123 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001124 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001126 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127 ((Py_UCS4*)data)[size] = 0;
1128 if (is_sharing) {
1129 _PyUnicode_WSTR_LENGTH(unicode) = size;
1130 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1131 }
1132 else {
1133 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1134 _PyUnicode_WSTR(unicode) = NULL;
1135 }
1136 }
Victor Stinner8f825062012-04-27 13:55:39 +02001137#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001138 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001139#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001140 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141 return obj;
1142}
1143
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.  A
   high surrogate immediately followed by a low surrogate is joined into one
   code point; every other unit, including a lone surrogate, is copied as
   is.  The other conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *cursor = begin;
    Py_UCS4 *dest;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dest = PyUnicode_4BYTE_DATA(unicode);

    while (cursor < end) {
        int is_pair;

        assert(dest < (PyUnicode_4BYTE_DATA(unicode) +
                       _PyUnicode_GET_LENGTH(unicode)));

        /* Only look at cursor[1] when it is still inside the input. */
        is_pair = (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
                   && (cursor + 1) < end
                   && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]));
        if (is_pair) {
            *dest = Py_UNICODE_JOIN_SURROGATES(cursor[0], cursor[1]);
            cursor += 2;
        }
        else {
            *dest = *cursor;
            cursor += 1;
        }
        dest++;
    }
    /* The output must have been filled exactly. */
    assert(dest == (PyUnicode_4BYTE_DATA(unicode) +
                    _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1183
Victor Stinnercd9950f2011-10-02 00:34:53 +02001184static int
Victor Stinner488fa492011-12-12 00:01:39 +01001185unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001186{
Victor Stinner488fa492011-12-12 00:01:39 +01001187 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001188 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001189 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001190 return -1;
1191 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001192 return 0;
1193}
1194
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001195static int
1196_copy_characters(PyObject *to, Py_ssize_t to_start,
1197 PyObject *from, Py_ssize_t from_start,
1198 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001199{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001200 unsigned int from_kind, to_kind;
1201 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001202
Victor Stinneree4544c2012-05-09 22:24:08 +02001203 assert(0 <= how_many);
1204 assert(0 <= from_start);
1205 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001206 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001207 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001208 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209
Victor Stinnerd3f08822012-05-29 12:57:52 +02001210 assert(PyUnicode_Check(to));
1211 assert(PyUnicode_IS_READY(to));
1212 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1213
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001214 if (how_many == 0)
1215 return 0;
1216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001218 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001221
Victor Stinnerf1852262012-06-16 16:38:26 +02001222#ifdef Py_DEBUG
1223 if (!check_maxchar
1224 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1225 {
1226 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1227 Py_UCS4 ch;
1228 Py_ssize_t i;
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 assert(ch <= to_maxchar);
1232 }
1233 }
1234#endif
1235
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001236 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001237 if (check_maxchar
1238 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1239 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001240 /* Writing Latin-1 characters into an ASCII string requires to
1241 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001242 Py_UCS4 max_char;
1243 max_char = ucs1lib_find_max_char(from_data,
1244 (Py_UCS1*)from_data + how_many);
1245 if (max_char >= 128)
1246 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001247 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001248 Py_MEMCPY((char*)to_data + to_kind * to_start,
1249 (char*)from_data + from_kind * from_start,
1250 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001252 else if (from_kind == PyUnicode_1BYTE_KIND
1253 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001254 {
1255 _PyUnicode_CONVERT_BYTES(
1256 Py_UCS1, Py_UCS2,
1257 PyUnicode_1BYTE_DATA(from) + from_start,
1258 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1259 PyUnicode_2BYTE_DATA(to) + to_start
1260 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001261 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001262 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001263 && to_kind == PyUnicode_4BYTE_KIND)
1264 {
1265 _PyUnicode_CONVERT_BYTES(
1266 Py_UCS1, Py_UCS4,
1267 PyUnicode_1BYTE_DATA(from) + from_start,
1268 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1269 PyUnicode_4BYTE_DATA(to) + to_start
1270 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001271 }
1272 else if (from_kind == PyUnicode_2BYTE_KIND
1273 && to_kind == PyUnicode_4BYTE_KIND)
1274 {
1275 _PyUnicode_CONVERT_BYTES(
1276 Py_UCS2, Py_UCS4,
1277 PyUnicode_2BYTE_DATA(from) + from_start,
1278 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1279 PyUnicode_4BYTE_DATA(to) + to_start
1280 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001281 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001282 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001283 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1284
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001285 if (!check_maxchar) {
1286 if (from_kind == PyUnicode_2BYTE_KIND
1287 && to_kind == PyUnicode_1BYTE_KIND)
1288 {
1289 _PyUnicode_CONVERT_BYTES(
1290 Py_UCS2, Py_UCS1,
1291 PyUnicode_2BYTE_DATA(from) + from_start,
1292 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1293 PyUnicode_1BYTE_DATA(to) + to_start
1294 );
1295 }
1296 else if (from_kind == PyUnicode_4BYTE_KIND
1297 && to_kind == PyUnicode_1BYTE_KIND)
1298 {
1299 _PyUnicode_CONVERT_BYTES(
1300 Py_UCS4, Py_UCS1,
1301 PyUnicode_4BYTE_DATA(from) + from_start,
1302 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1303 PyUnicode_1BYTE_DATA(to) + to_start
1304 );
1305 }
1306 else if (from_kind == PyUnicode_4BYTE_KIND
1307 && to_kind == PyUnicode_2BYTE_KIND)
1308 {
1309 _PyUnicode_CONVERT_BYTES(
1310 Py_UCS4, Py_UCS2,
1311 PyUnicode_4BYTE_DATA(from) + from_start,
1312 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1313 PyUnicode_2BYTE_DATA(to) + to_start
1314 );
1315 }
1316 else {
1317 assert(0);
1318 return -1;
1319 }
1320 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001321 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001322 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001323 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001324 Py_ssize_t i;
1325
Victor Stinnera0702ab2011-09-29 14:14:38 +02001326 for (i=0; i < how_many; i++) {
1327 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001328 if (ch > to_maxchar)
1329 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001330 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1331 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001332 }
1333 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001334 return 0;
1335}
1336
Victor Stinnerd3f08822012-05-29 12:57:52 +02001337void
1338_PyUnicode_FastCopyCharacters(
1339 PyObject *to, Py_ssize_t to_start,
1340 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001341{
1342 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1343}
1344
1345Py_ssize_t
1346PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1347 PyObject *from, Py_ssize_t from_start,
1348 Py_ssize_t how_many)
1349{
1350 int err;
1351
1352 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1353 PyErr_BadInternalCall();
1354 return -1;
1355 }
1356
Benjamin Petersonbac79492012-01-14 13:34:47 -05001357 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001358 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001359 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001360 return -1;
1361
Victor Stinnerd3f08822012-05-29 12:57:52 +02001362 if (from_start < 0) {
1363 PyErr_SetString(PyExc_IndexError, "string index out of range");
1364 return -1;
1365 }
1366 if (to_start < 0) {
1367 PyErr_SetString(PyExc_IndexError, "string index out of range");
1368 return -1;
1369 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001370 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1371 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1372 PyErr_Format(PyExc_SystemError,
1373 "Cannot write %zi characters at %zi "
1374 "in a string of %zi characters",
1375 how_many, to_start, PyUnicode_GET_LENGTH(to));
1376 return -1;
1377 }
1378
1379 if (how_many == 0)
1380 return 0;
1381
Victor Stinner488fa492011-12-12 00:01:39 +01001382 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001383 return -1;
1384
1385 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1386 if (err) {
1387 PyErr_Format(PyExc_SystemError,
1388 "Cannot copy %s characters "
1389 "into a string of %s characters",
1390 unicode_kind_name(from),
1391 unicode_kind_name(to));
1392 return -1;
1393 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001394 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395}
1396
Victor Stinner17222162011-09-28 22:15:37 +02001397/* Find the maximum code point and count the number of surrogate pairs so a
1398 correct string length can be computed before converting a string to UCS4.
1399 This function counts single surrogates as a character and not as a pair.
1400
1401 Return 0 on success, or -1 on error. */
1402static int
1403find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1404 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405{
1406 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001407 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408
Victor Stinnerc53be962011-10-02 21:33:54 +02001409 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 *num_surrogates = 0;
1411 *maxchar = 0;
1412
1413 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001415 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1416 && (iter+1) < end
1417 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1418 {
1419 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1420 ++(*num_surrogates);
1421 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 }
1423 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001425 {
1426 ch = *iter;
1427 iter++;
1428 }
1429 if (ch > *maxchar) {
1430 *maxchar = ch;
1431 if (*maxchar > MAX_UNICODE) {
1432 PyErr_Format(PyExc_ValueError,
1433 "character U+%x is not in range [U+0000; U+10ffff]",
1434 ch);
1435 return -1;
1436 }
1437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
1439 return 0;
1440}
1441
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001442int
1443_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444{
1445 wchar_t *end;
1446 Py_UCS4 maxchar = 0;
1447 Py_ssize_t num_surrogates;
1448#if SIZEOF_WCHAR_T == 2
1449 Py_ssize_t length_wo_surrogates;
1450#endif
1451
Georg Brandl7597add2011-10-05 16:36:47 +02001452 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001453 strings were created using _PyObject_New() and where no canonical
1454 representation (the str field) has been set yet aka strings
1455 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001456 assert(_PyUnicode_CHECK(unicode));
1457 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001459 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001460 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001461 /* Actually, it should neither be interned nor be anything else: */
1462 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001465 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001466 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468
1469 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001470 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1471 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 PyErr_NoMemory();
1473 return -1;
1474 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001475 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 _PyUnicode_WSTR(unicode), end,
1477 PyUnicode_1BYTE_DATA(unicode));
1478 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1479 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1480 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1481 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001482 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001483 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001484 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 }
1486 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001487 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001488 _PyUnicode_UTF8(unicode) = NULL;
1489 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 }
1491 PyObject_FREE(_PyUnicode_WSTR(unicode));
1492 _PyUnicode_WSTR(unicode) = NULL;
1493 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1494 }
1495 /* In this case we might have to convert down from 4-byte native
1496 wchar_t to 2-byte unicode. */
1497 else if (maxchar < 65536) {
1498 assert(num_surrogates == 0 &&
1499 "FindMaxCharAndNumSurrogatePairs() messed up");
1500
Victor Stinner506f5922011-09-28 22:34:18 +02001501#if SIZEOF_WCHAR_T == 2
1502 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001503 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001504 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1505 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1506 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001507 _PyUnicode_UTF8(unicode) = NULL;
1508 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001509#else
1510 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001511 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001512 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001513 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001514 PyErr_NoMemory();
1515 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516 }
Victor Stinner506f5922011-09-28 22:34:18 +02001517 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1518 _PyUnicode_WSTR(unicode), end,
1519 PyUnicode_2BYTE_DATA(unicode));
1520 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1521 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1522 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001523 _PyUnicode_UTF8(unicode) = NULL;
1524 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001525 PyObject_FREE(_PyUnicode_WSTR(unicode));
1526 _PyUnicode_WSTR(unicode) = NULL;
1527 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1528#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529 }
1530 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1531 else {
1532#if SIZEOF_WCHAR_T == 2
1533 /* in case the native representation is 2-bytes, we need to allocate a
1534 new normalized 4-byte version. */
1535 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001536 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1537 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538 PyErr_NoMemory();
1539 return -1;
1540 }
1541 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1542 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001543 _PyUnicode_UTF8(unicode) = NULL;
1544 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001545 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1546 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001547 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001548 PyObject_FREE(_PyUnicode_WSTR(unicode));
1549 _PyUnicode_WSTR(unicode) = NULL;
1550 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1551#else
1552 assert(num_surrogates == 0);
1553
Victor Stinnerc3c74152011-10-02 20:39:55 +02001554 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001555 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001556 _PyUnicode_UTF8(unicode) = NULL;
1557 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001558 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1559#endif
1560 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1561 }
1562 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001563 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001564 return 0;
1565}
1566
Alexander Belopolsky40018472011-02-26 01:02:56 +00001567static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001568unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569{
Walter Dörwald16807132007-05-25 13:52:07 +00001570 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001571 case SSTATE_NOT_INTERNED:
1572 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001573
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 case SSTATE_INTERNED_MORTAL:
1575 /* revive dead object temporarily for DelItem */
1576 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001577 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001578 Py_FatalError(
1579 "deletion of interned string failed");
1580 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001581
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 case SSTATE_INTERNED_IMMORTAL:
1583 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001584
Benjamin Peterson29060642009-01-31 22:14:21 +00001585 default:
1586 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001587 }
1588
Victor Stinner03490912011-10-03 23:45:12 +02001589 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001590 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001591 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001592 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001593 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1594 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001595
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001596 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001597}
1598
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or the
   cached one-character string stored in unicode_latin1[], else 0. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;
    /* Only a non-legacy string of length 1 can be a cached character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1615
Alexander Belopolsky40018472011-02-26 01:02:56 +00001616static int
Victor Stinner488fa492011-12-12 00:01:39 +01001617unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001618{
Victor Stinner488fa492011-12-12 00:01:39 +01001619 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 if (Py_REFCNT(unicode) != 1)
1621 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001622 if (_PyUnicode_HASH(unicode) != -1)
1623 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001624 if (PyUnicode_CHECK_INTERNED(unicode))
1625 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001626 if (!PyUnicode_CheckExact(unicode))
1627 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001628#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001629 /* singleton refcount is greater than 1 */
1630 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001631#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001632 return 1;
1633}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001634
Victor Stinnerfe226c02011-10-03 03:52:20 +02001635static int
1636unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1637{
1638 PyObject *unicode;
1639 Py_ssize_t old_length;
1640
1641 assert(p_unicode != NULL);
1642 unicode = *p_unicode;
1643
1644 assert(unicode != NULL);
1645 assert(PyUnicode_Check(unicode));
1646 assert(0 <= length);
1647
Victor Stinner910337b2011-10-03 03:20:16 +02001648 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001649 old_length = PyUnicode_WSTR_LENGTH(unicode);
1650 else
1651 old_length = PyUnicode_GET_LENGTH(unicode);
1652 if (old_length == length)
1653 return 0;
1654
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001655 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001656 _Py_INCREF_UNICODE_EMPTY();
1657 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001658 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001659 Py_DECREF(*p_unicode);
1660 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001661 return 0;
1662 }
1663
Victor Stinner488fa492011-12-12 00:01:39 +01001664 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001665 PyObject *copy = resize_copy(unicode, length);
1666 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001667 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001668 Py_DECREF(*p_unicode);
1669 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001670 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001671 }
1672
Victor Stinnerfe226c02011-10-03 03:52:20 +02001673 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001674 PyObject *new_unicode = resize_compact(unicode, length);
1675 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001676 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001677 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001678 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001679 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001680 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001681}
1682
Alexander Belopolsky40018472011-02-26 01:02:56 +00001683int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001684PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001685{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001686 PyObject *unicode;
1687 if (p_unicode == NULL) {
1688 PyErr_BadInternalCall();
1689 return -1;
1690 }
1691 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001692 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001693 {
1694 PyErr_BadInternalCall();
1695 return -1;
1696 }
1697 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001698}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001699
Victor Stinnerc5166102012-02-22 13:55:02 +01001700/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001701
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001702 WARNING: The function doesn't copy the terminating null character and
1703 doesn't check the maximum character (may write a latin1 character in an
1704 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001705static void
1706unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1707 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001708{
1709 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1710 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001711 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001712
1713 switch (kind) {
1714 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001715 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001716#ifdef Py_DEBUG
1717 if (PyUnicode_IS_ASCII(unicode)) {
1718 Py_UCS4 maxchar = ucs1lib_find_max_char(
1719 (const Py_UCS1*)str,
1720 (const Py_UCS1*)str + len);
1721 assert(maxchar < 128);
1722 }
1723#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001724 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001725 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001726 }
1727 case PyUnicode_2BYTE_KIND: {
1728 Py_UCS2 *start = (Py_UCS2 *)data + index;
1729 Py_UCS2 *ucs2 = start;
1730 assert(index <= PyUnicode_GET_LENGTH(unicode));
1731
Victor Stinner184252a2012-06-16 02:57:41 +02001732 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001733 *ucs2 = (Py_UCS2)*str;
1734
1735 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001736 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001737 }
1738 default: {
1739 Py_UCS4 *start = (Py_UCS4 *)data + index;
1740 Py_UCS4 *ucs4 = start;
1741 assert(kind == PyUnicode_4BYTE_KIND);
1742 assert(index <= PyUnicode_GET_LENGTH(unicode));
1743
Victor Stinner184252a2012-06-16 02:57:41 +02001744 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001745 *ucs4 = (Py_UCS4)*str;
1746
1747 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001748 }
1749 }
1750}
1751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752static PyObject*
1753get_latin1_char(unsigned char ch)
1754{
Victor Stinnera464fc12011-10-02 20:39:30 +02001755 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001757 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 if (!unicode)
1759 return NULL;
1760 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001761 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 unicode_latin1[ch] = unicode;
1763 }
1764 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001765 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766}
1767
Victor Stinner985a82a2014-01-03 12:53:47 +01001768static PyObject*
1769unicode_char(Py_UCS4 ch)
1770{
1771 PyObject *unicode;
1772
1773 assert(ch <= MAX_UNICODE);
1774
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001775 if (ch < 256)
1776 return get_latin1_char(ch);
1777
Victor Stinner985a82a2014-01-03 12:53:47 +01001778 unicode = PyUnicode_New(1, ch);
1779 if (unicode == NULL)
1780 return NULL;
1781 switch (PyUnicode_KIND(unicode)) {
1782 case PyUnicode_1BYTE_KIND:
1783 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1784 break;
1785 case PyUnicode_2BYTE_KIND:
1786 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1787 break;
1788 default:
1789 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1790 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1791 }
1792 assert(_PyUnicode_CheckConsistency(unicode, 1));
1793 return unicode;
1794}
1795
Alexander Belopolsky40018472011-02-26 01:02:56 +00001796PyObject *
1797PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001799 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 Py_UCS4 maxchar = 0;
1801 Py_ssize_t num_surrogates;
1802
1803 if (u == NULL)
1804 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001806 /* If the Unicode data is known at construction time, we can apply
1807 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001810 if (size == 0)
1811 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 /* Single character Unicode objects in the Latin-1 range are
1814 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001815 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816 return get_latin1_char((unsigned char)*u);
1817
1818 /* If not empty and not single character, copy the Unicode data
1819 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001820 if (find_maxchar_surrogates(u, u + size,
1821 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822 return NULL;
1823
Victor Stinner8faf8212011-12-08 22:14:11 +01001824 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001825 if (!unicode)
1826 return NULL;
1827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828 switch (PyUnicode_KIND(unicode)) {
1829 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001830 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1832 break;
1833 case PyUnicode_2BYTE_KIND:
1834#if Py_UNICODE_SIZE == 2
1835 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1836#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001837 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001838 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1839#endif
1840 break;
1841 case PyUnicode_4BYTE_KIND:
1842#if SIZEOF_WCHAR_T == 2
1843 /* This is the only case which has to process surrogates, thus
1844 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001845 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846#else
1847 assert(num_surrogates == 0);
1848 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1849#endif
1850 break;
1851 default:
1852 assert(0 && "Impossible state");
1853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001855 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856}
1857
Alexander Belopolsky40018472011-02-26 01:02:56 +00001858PyObject *
1859PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001860{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001861 if (size < 0) {
1862 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001863 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001864 return NULL;
1865 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001866 if (u != NULL)
1867 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1868 else
1869 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001870}
1871
Alexander Belopolsky40018472011-02-26 01:02:56 +00001872PyObject *
1873PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001874{
1875 size_t size = strlen(u);
1876 if (size > PY_SSIZE_T_MAX) {
1877 PyErr_SetString(PyExc_OverflowError, "input too long");
1878 return NULL;
1879 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001880 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001881}
1882
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001883PyObject *
1884_PyUnicode_FromId(_Py_Identifier *id)
1885{
1886 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001887 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1888 strlen(id->string),
1889 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001890 if (!id->object)
1891 return NULL;
1892 PyUnicode_InternInPlace(&id->object);
1893 assert(!id->next);
1894 id->next = static_strings;
1895 static_strings = id;
1896 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001897 return id->object;
1898}
1899
1900void
1901_PyUnicode_ClearStaticStrings()
1902{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001903 _Py_Identifier *tmp, *s = static_strings;
1904 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001905 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001906 tmp = s->next;
1907 s->next = NULL;
1908 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001909 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001910 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001911}
1912
Benjamin Peterson0df54292012-03-26 14:50:32 -04001913/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001914
Victor Stinnerd3f08822012-05-29 12:57:52 +02001915PyObject*
1916_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001917{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001918 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001919 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001920 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001921#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001922 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001923#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001924 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001925 }
Victor Stinner785938e2011-12-11 20:09:03 +01001926 unicode = PyUnicode_New(size, 127);
1927 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001928 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001929 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1930 assert(_PyUnicode_CheckConsistency(unicode, 1));
1931 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001932}
1933
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001934static Py_UCS4
1935kind_maxchar_limit(unsigned int kind)
1936{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001937 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001938 case PyUnicode_1BYTE_KIND:
1939 return 0x80;
1940 case PyUnicode_2BYTE_KIND:
1941 return 0x100;
1942 case PyUnicode_4BYTE_KIND:
1943 return 0x10000;
1944 default:
1945 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001946 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001947 }
1948}
1949
Victor Stinnere6abb482012-05-02 01:15:40 +02001950Py_LOCAL_INLINE(Py_UCS4)
1951align_maxchar(Py_UCS4 maxchar)
1952{
1953 if (maxchar <= 127)
1954 return 127;
1955 else if (maxchar <= 255)
1956 return 255;
1957 else if (maxchar <= 65535)
1958 return 65535;
1959 else
1960 return MAX_UNICODE;
1961}
1962
Victor Stinner702c7342011-10-05 13:50:52 +02001963static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001964_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001965{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001966 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001967 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001968
Serhiy Storchaka678db842013-01-26 12:16:36 +02001969 if (size == 0)
1970 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001971 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001972 if (size == 1)
1973 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001974
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001975 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001976 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977 if (!res)
1978 return NULL;
1979 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001980 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001982}
1983
Victor Stinnere57b1c02011-09-28 22:20:48 +02001984static PyObject*
1985_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986{
1987 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001988 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001989
Serhiy Storchaka678db842013-01-26 12:16:36 +02001990 if (size == 0)
1991 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001992 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01001993 if (size == 1)
1994 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001995
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001996 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001997 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998 if (!res)
1999 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002000 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002001 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002002 else {
2003 _PyUnicode_CONVERT_BYTES(
2004 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2005 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002006 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002007 return res;
2008}
2009
Victor Stinnere57b1c02011-09-28 22:20:48 +02002010static PyObject*
2011_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012{
2013 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002014 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002015
Serhiy Storchaka678db842013-01-26 12:16:36 +02002016 if (size == 0)
2017 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002018 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002019 if (size == 1)
2020 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002021
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002022 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002023 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002024 if (!res)
2025 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002026 if (max_char < 256)
2027 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2028 PyUnicode_1BYTE_DATA(res));
2029 else if (max_char < 0x10000)
2030 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2031 PyUnicode_2BYTE_DATA(res));
2032 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002034 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 return res;
2036}
2037
2038PyObject*
2039PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2040{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002041 if (size < 0) {
2042 PyErr_SetString(PyExc_ValueError, "size must be positive");
2043 return NULL;
2044 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002045 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002047 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002049 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002051 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002052 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002053 PyErr_SetString(PyExc_SystemError, "invalid kind");
2054 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002055 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056}
2057
Victor Stinnerece58de2012-04-23 23:36:38 +02002058Py_UCS4
2059_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2060{
2061 enum PyUnicode_Kind kind;
2062 void *startptr, *endptr;
2063
2064 assert(PyUnicode_IS_READY(unicode));
2065 assert(0 <= start);
2066 assert(end <= PyUnicode_GET_LENGTH(unicode));
2067 assert(start <= end);
2068
2069 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2070 return PyUnicode_MAX_CHAR_VALUE(unicode);
2071
2072 if (start == end)
2073 return 127;
2074
Victor Stinner94d558b2012-04-27 22:26:58 +02002075 if (PyUnicode_IS_ASCII(unicode))
2076 return 127;
2077
Victor Stinnerece58de2012-04-23 23:36:38 +02002078 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002079 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002080 endptr = (char *)startptr + end * kind;
2081 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002082 switch(kind) {
2083 case PyUnicode_1BYTE_KIND:
2084 return ucs1lib_find_max_char(startptr, endptr);
2085 case PyUnicode_2BYTE_KIND:
2086 return ucs2lib_find_max_char(startptr, endptr);
2087 case PyUnicode_4BYTE_KIND:
2088 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002089 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002090 assert(0);
2091 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002092 }
2093}
2094
Victor Stinner25a4b292011-10-06 12:31:55 +02002095/* Ensure that a string uses the most efficient storage, if it is not the
2096 case: create a new string with of the right kind. Write NULL into *p_unicode
2097 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002098static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002099unicode_adjust_maxchar(PyObject **p_unicode)
2100{
2101 PyObject *unicode, *copy;
2102 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002103 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002104 unsigned int kind;
2105
2106 assert(p_unicode != NULL);
2107 unicode = *p_unicode;
2108 assert(PyUnicode_IS_READY(unicode));
2109 if (PyUnicode_IS_ASCII(unicode))
2110 return;
2111
2112 len = PyUnicode_GET_LENGTH(unicode);
2113 kind = PyUnicode_KIND(unicode);
2114 if (kind == PyUnicode_1BYTE_KIND) {
2115 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002116 max_char = ucs1lib_find_max_char(u, u + len);
2117 if (max_char >= 128)
2118 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002119 }
2120 else if (kind == PyUnicode_2BYTE_KIND) {
2121 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002122 max_char = ucs2lib_find_max_char(u, u + len);
2123 if (max_char >= 256)
2124 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002125 }
2126 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002127 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002128 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002129 max_char = ucs4lib_find_max_char(u, u + len);
2130 if (max_char >= 0x10000)
2131 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002132 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002133 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002134 if (copy != NULL)
2135 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002136 Py_DECREF(unicode);
2137 *p_unicode = copy;
2138}
2139
Victor Stinner034f6cf2011-09-30 02:26:44 +02002140PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002141_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002142{
Victor Stinner87af4f22011-11-21 23:03:47 +01002143 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002144 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002145
Victor Stinner034f6cf2011-09-30 02:26:44 +02002146 if (!PyUnicode_Check(unicode)) {
2147 PyErr_BadInternalCall();
2148 return NULL;
2149 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002150 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002151 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002152
Victor Stinner87af4f22011-11-21 23:03:47 +01002153 length = PyUnicode_GET_LENGTH(unicode);
2154 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002155 if (!copy)
2156 return NULL;
2157 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2158
Victor Stinner87af4f22011-11-21 23:03:47 +01002159 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2160 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002161 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002162 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002163}
2164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165
Victor Stinnerbc603d12011-10-02 01:00:40 +02002166/* Widen Unicode objects to larger buffers. Don't write terminating null
2167 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168
2169void*
2170_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2171{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002172 Py_ssize_t len;
2173 void *result;
2174 unsigned int skind;
2175
Benjamin Petersonbac79492012-01-14 13:34:47 -05002176 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002177 return NULL;
2178
2179 len = PyUnicode_GET_LENGTH(s);
2180 skind = PyUnicode_KIND(s);
2181 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002182 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002183 return NULL;
2184 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002185 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002186 case PyUnicode_2BYTE_KIND:
2187 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2188 if (!result)
2189 return PyErr_NoMemory();
2190 assert(skind == PyUnicode_1BYTE_KIND);
2191 _PyUnicode_CONVERT_BYTES(
2192 Py_UCS1, Py_UCS2,
2193 PyUnicode_1BYTE_DATA(s),
2194 PyUnicode_1BYTE_DATA(s) + len,
2195 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002197 case PyUnicode_4BYTE_KIND:
2198 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2199 if (!result)
2200 return PyErr_NoMemory();
2201 if (skind == PyUnicode_2BYTE_KIND) {
2202 _PyUnicode_CONVERT_BYTES(
2203 Py_UCS2, Py_UCS4,
2204 PyUnicode_2BYTE_DATA(s),
2205 PyUnicode_2BYTE_DATA(s) + len,
2206 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002207 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002208 else {
2209 assert(skind == PyUnicode_1BYTE_KIND);
2210 _PyUnicode_CONVERT_BYTES(
2211 Py_UCS1, Py_UCS4,
2212 PyUnicode_1BYTE_DATA(s),
2213 PyUnicode_1BYTE_DATA(s) + len,
2214 result);
2215 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002217 default:
2218 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002219 }
Victor Stinner01698042011-10-04 00:04:26 +02002220 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 return NULL;
2222}
2223
2224static Py_UCS4*
2225as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2226 int copy_null)
2227{
2228 int kind;
2229 void *data;
2230 Py_ssize_t len, targetlen;
2231 if (PyUnicode_READY(string) == -1)
2232 return NULL;
2233 kind = PyUnicode_KIND(string);
2234 data = PyUnicode_DATA(string);
2235 len = PyUnicode_GET_LENGTH(string);
2236 targetlen = len;
2237 if (copy_null)
2238 targetlen++;
2239 if (!target) {
2240 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2241 PyErr_NoMemory();
2242 return NULL;
2243 }
2244 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2245 if (!target) {
2246 PyErr_NoMemory();
2247 return NULL;
2248 }
2249 }
2250 else {
2251 if (targetsize < targetlen) {
2252 PyErr_Format(PyExc_SystemError,
2253 "string is longer than the buffer");
2254 if (copy_null && 0 < targetsize)
2255 target[0] = 0;
2256 return NULL;
2257 }
2258 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002259 if (kind == PyUnicode_1BYTE_KIND) {
2260 Py_UCS1 *start = (Py_UCS1 *) data;
2261 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002262 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002263 else if (kind == PyUnicode_2BYTE_KIND) {
2264 Py_UCS2 *start = (Py_UCS2 *) data;
2265 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2266 }
2267 else {
2268 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002269 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002270 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 if (copy_null)
2272 target[len] = 0;
2273 return target;
2274}
2275
2276Py_UCS4*
2277PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2278 int copy_null)
2279{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002280 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281 PyErr_BadInternalCall();
2282 return NULL;
2283 }
2284 return as_ucs4(string, target, targetsize, copy_null);
2285}
2286
2287Py_UCS4*
2288PyUnicode_AsUCS4Copy(PyObject *string)
2289{
2290 return as_ucs4(string, NULL, 0, 1);
2291}
2292
2293#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002294
Alexander Belopolsky40018472011-02-26 01:02:56 +00002295PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002296PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002300 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002301 PyErr_BadInternalCall();
2302 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303 }
2304
Martin v. Löwis790465f2008-04-05 20:41:37 +00002305 if (size == -1) {
2306 size = wcslen(w);
2307 }
2308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002309 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002310}
2311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002312#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002313
Walter Dörwald346737f2007-05-31 10:44:43 +00002314static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002315makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002316 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002317{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002318 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002319 if (longflag)
2320 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002321 else if (longlongflag) {
2322 /* longlongflag should only ever be nonzero on machines with
2323 HAVE_LONG_LONG defined */
2324#ifdef HAVE_LONG_LONG
2325 char *f = PY_FORMAT_LONG_LONG;
2326 while (*f)
2327 *fmt++ = *f++;
2328#else
2329 /* we shouldn't ever get here */
2330 assert(0);
2331 *fmt++ = 'l';
2332#endif
2333 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002334 else if (size_tflag) {
2335 char *f = PY_FORMAT_SIZE_T;
2336 while (*f)
2337 *fmt++ = *f++;
2338 }
2339 *fmt++ = c;
2340 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002341}
2342
Victor Stinner15a11362012-10-06 23:48:20 +02002343/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002344 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2345 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2346#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002347
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002348static int
2349unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2350 Py_ssize_t width, Py_ssize_t precision)
2351{
2352 Py_ssize_t length, fill, arglen;
2353 Py_UCS4 maxchar;
2354
2355 if (PyUnicode_READY(str) == -1)
2356 return -1;
2357
2358 length = PyUnicode_GET_LENGTH(str);
2359 if ((precision == -1 || precision >= length)
2360 && width <= length)
2361 return _PyUnicodeWriter_WriteStr(writer, str);
2362
2363 if (precision != -1)
2364 length = Py_MIN(precision, length);
2365
2366 arglen = Py_MAX(length, width);
2367 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2368 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2369 else
2370 maxchar = writer->maxchar;
2371
2372 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2373 return -1;
2374
2375 if (width > length) {
2376 fill = width - length;
2377 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2378 return -1;
2379 writer->pos += fill;
2380 }
2381
2382 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2383 str, 0, length);
2384 writer->pos += length;
2385 return 0;
2386}
2387
2388static int
2389unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2390 Py_ssize_t width, Py_ssize_t precision)
2391{
2392 /* UTF-8 */
2393 Py_ssize_t length;
2394 PyObject *unicode;
2395 int res;
2396
2397 length = strlen(str);
2398 if (precision != -1)
2399 length = Py_MIN(length, precision);
2400 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2401 if (unicode == NULL)
2402 return -1;
2403
2404 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2405 Py_DECREF(unicode);
2406 return res;
2407}
2408
Victor Stinner96865452011-03-01 23:44:09 +00002409static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002410unicode_fromformat_arg(_PyUnicodeWriter *writer,
2411 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002412{
Victor Stinnere215d962012-10-06 23:03:36 +02002413 const char *p;
2414 Py_ssize_t len;
2415 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002416 Py_ssize_t width;
2417 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002418 int longflag;
2419 int longlongflag;
2420 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002421 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002422
2423 p = f;
2424 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002425 zeropad = 0;
2426 if (*f == '0') {
2427 zeropad = 1;
2428 f++;
2429 }
Victor Stinner96865452011-03-01 23:44:09 +00002430
2431 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002432 width = -1;
2433 if (Py_ISDIGIT((unsigned)*f)) {
2434 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002435 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002436 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002437 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002438 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002439 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002440 return NULL;
2441 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002442 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002443 f++;
2444 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002445 }
2446 precision = -1;
2447 if (*f == '.') {
2448 f++;
2449 if (Py_ISDIGIT((unsigned)*f)) {
2450 precision = (*f - '0');
2451 f++;
2452 while (Py_ISDIGIT((unsigned)*f)) {
2453 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2454 PyErr_SetString(PyExc_ValueError,
2455 "precision too big");
2456 return NULL;
2457 }
2458 precision = (precision * 10) + (*f - '0');
2459 f++;
2460 }
2461 }
Victor Stinner96865452011-03-01 23:44:09 +00002462 if (*f == '%') {
2463 /* "%.3%s" => f points to "3" */
2464 f--;
2465 }
2466 }
2467 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002468 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002469 f--;
2470 }
Victor Stinner96865452011-03-01 23:44:09 +00002471
2472 /* Handle %ld, %lu, %lld and %llu. */
2473 longflag = 0;
2474 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002475 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002476 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002477 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002478 longflag = 1;
2479 ++f;
2480 }
2481#ifdef HAVE_LONG_LONG
2482 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002483 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002484 longlongflag = 1;
2485 f += 2;
2486 }
2487#endif
2488 }
2489 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002490 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002491 size_tflag = 1;
2492 ++f;
2493 }
Victor Stinnere215d962012-10-06 23:03:36 +02002494
2495 if (f[1] == '\0')
2496 writer->overallocate = 0;
2497
2498 switch (*f) {
2499 case 'c':
2500 {
2501 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002502 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002503 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002504 "character argument not in range(0x110000)");
2505 return NULL;
2506 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002507 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002508 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002509 break;
2510 }
2511
2512 case 'i':
2513 case 'd':
2514 case 'u':
2515 case 'x':
2516 {
2517 /* used by sprintf */
2518 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002519 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002520 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002521
2522 if (*f == 'u') {
2523 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2524
2525 if (longflag)
2526 len = sprintf(buffer, fmt,
2527 va_arg(*vargs, unsigned long));
2528#ifdef HAVE_LONG_LONG
2529 else if (longlongflag)
2530 len = sprintf(buffer, fmt,
2531 va_arg(*vargs, unsigned PY_LONG_LONG));
2532#endif
2533 else if (size_tflag)
2534 len = sprintf(buffer, fmt,
2535 va_arg(*vargs, size_t));
2536 else
2537 len = sprintf(buffer, fmt,
2538 va_arg(*vargs, unsigned int));
2539 }
2540 else if (*f == 'x') {
2541 makefmt(fmt, 0, 0, 0, 'x');
2542 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2543 }
2544 else {
2545 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2546
2547 if (longflag)
2548 len = sprintf(buffer, fmt,
2549 va_arg(*vargs, long));
2550#ifdef HAVE_LONG_LONG
2551 else if (longlongflag)
2552 len = sprintf(buffer, fmt,
2553 va_arg(*vargs, PY_LONG_LONG));
2554#endif
2555 else if (size_tflag)
2556 len = sprintf(buffer, fmt,
2557 va_arg(*vargs, Py_ssize_t));
2558 else
2559 len = sprintf(buffer, fmt,
2560 va_arg(*vargs, int));
2561 }
2562 assert(len >= 0);
2563
Victor Stinnere215d962012-10-06 23:03:36 +02002564 if (precision < len)
2565 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002566
2567 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002568 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2569 return NULL;
2570
Victor Stinnere215d962012-10-06 23:03:36 +02002571 if (width > precision) {
2572 Py_UCS4 fillchar;
2573 fill = width - precision;
2574 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002575 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2576 return NULL;
2577 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002578 }
Victor Stinner15a11362012-10-06 23:48:20 +02002579 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002580 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002581 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2582 return NULL;
2583 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002584 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002585
Victor Stinner4a587072013-11-19 12:54:53 +01002586 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2587 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002588 break;
2589 }
2590
2591 case 'p':
2592 {
2593 char number[MAX_LONG_LONG_CHARS];
2594
2595 len = sprintf(number, "%p", va_arg(*vargs, void*));
2596 assert(len >= 0);
2597
2598 /* %p is ill-defined: ensure leading 0x. */
2599 if (number[1] == 'X')
2600 number[1] = 'x';
2601 else if (number[1] != 'x') {
2602 memmove(number + 2, number,
2603 strlen(number) + 1);
2604 number[0] = '0';
2605 number[1] = 'x';
2606 len += 2;
2607 }
2608
Victor Stinner4a587072013-11-19 12:54:53 +01002609 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002610 return NULL;
2611 break;
2612 }
2613
2614 case 's':
2615 {
2616 /* UTF-8 */
2617 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002618 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002619 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002620 break;
2621 }
2622
2623 case 'U':
2624 {
2625 PyObject *obj = va_arg(*vargs, PyObject *);
2626 assert(obj && _PyUnicode_CHECK(obj));
2627
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002628 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002629 return NULL;
2630 break;
2631 }
2632
2633 case 'V':
2634 {
2635 PyObject *obj = va_arg(*vargs, PyObject *);
2636 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002637 if (obj) {
2638 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002639 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002640 return NULL;
2641 }
2642 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002643 assert(str != NULL);
2644 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002645 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002646 }
2647 break;
2648 }
2649
2650 case 'S':
2651 {
2652 PyObject *obj = va_arg(*vargs, PyObject *);
2653 PyObject *str;
2654 assert(obj);
2655 str = PyObject_Str(obj);
2656 if (!str)
2657 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002658 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002659 Py_DECREF(str);
2660 return NULL;
2661 }
2662 Py_DECREF(str);
2663 break;
2664 }
2665
2666 case 'R':
2667 {
2668 PyObject *obj = va_arg(*vargs, PyObject *);
2669 PyObject *repr;
2670 assert(obj);
2671 repr = PyObject_Repr(obj);
2672 if (!repr)
2673 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002674 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002675 Py_DECREF(repr);
2676 return NULL;
2677 }
2678 Py_DECREF(repr);
2679 break;
2680 }
2681
2682 case 'A':
2683 {
2684 PyObject *obj = va_arg(*vargs, PyObject *);
2685 PyObject *ascii;
2686 assert(obj);
2687 ascii = PyObject_ASCII(obj);
2688 if (!ascii)
2689 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002690 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002691 Py_DECREF(ascii);
2692 return NULL;
2693 }
2694 Py_DECREF(ascii);
2695 break;
2696 }
2697
2698 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002699 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002700 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002701 break;
2702
2703 default:
2704 /* if we stumble upon an unknown formatting code, copy the rest
2705 of the format string to the output string. (we cannot just
2706 skip the code, since there's no way to know what's in the
2707 argument list) */
2708 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002709 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002710 return NULL;
2711 f = p+len;
2712 return f;
2713 }
2714
2715 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002716 return f;
2717}
2718
Walter Dörwaldd2034312007-05-18 16:29:38 +00002719PyObject *
2720PyUnicode_FromFormatV(const char *format, va_list vargs)
2721{
Victor Stinnere215d962012-10-06 23:03:36 +02002722 va_list vargs2;
2723 const char *f;
2724 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002725
Victor Stinner8f674cc2013-04-17 23:02:17 +02002726 _PyUnicodeWriter_Init(&writer);
2727 writer.min_length = strlen(format) + 100;
2728 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002729
2730 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2731 Copy it to be able to pass a reference to a subfunction. */
2732 Py_VA_COPY(vargs2, vargs);
2733
2734 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002735 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002736 f = unicode_fromformat_arg(&writer, f, &vargs2);
2737 if (f == NULL)
2738 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002740 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002741 const char *p;
2742 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002743
Victor Stinnere215d962012-10-06 23:03:36 +02002744 p = f;
2745 do
2746 {
2747 if ((unsigned char)*p > 127) {
2748 PyErr_Format(PyExc_ValueError,
2749 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2750 "string, got a non-ASCII byte: 0x%02x",
2751 (unsigned char)*p);
2752 return NULL;
2753 }
2754 p++;
2755 }
2756 while (*p != '\0' && *p != '%');
2757 len = p - f;
2758
2759 if (*p == '\0')
2760 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002761
2762 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002763 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002764
2765 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002766 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002767 }
Victor Stinnere215d962012-10-06 23:03:36 +02002768 return _PyUnicodeWriter_Finish(&writer);
2769
2770 fail:
2771 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002772 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002773}
2774
Walter Dörwaldd2034312007-05-18 16:29:38 +00002775PyObject *
2776PyUnicode_FromFormat(const char *format, ...)
2777{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002778 PyObject* ret;
2779 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002780
2781#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002782 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002783#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002784 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002785#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002786 ret = PyUnicode_FromFormatV(format, vargs);
2787 va_end(vargs);
2788 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002789}
2790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002791#ifdef HAVE_WCHAR_H
2792
Victor Stinner5593d8a2010-10-02 11:11:27 +00002793/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2794 convert a Unicode object to a wide character string.
2795
Victor Stinnerd88d9832011-09-06 02:00:05 +02002796 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002797 character) required to convert the unicode object. Ignore size argument.
2798
Victor Stinnerd88d9832011-09-06 02:00:05 +02002799 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002800 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002801 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002802static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002803unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002804 wchar_t *w,
2805 Py_ssize_t size)
2806{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002807 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002808 const wchar_t *wstr;
2809
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002810 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002811 if (wstr == NULL)
2812 return -1;
2813
Victor Stinner5593d8a2010-10-02 11:11:27 +00002814 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002815 if (size > res)
2816 size = res + 1;
2817 else
2818 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002819 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002820 return res;
2821 }
2822 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002823 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002824}
2825
2826Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002827PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002828 wchar_t *w,
2829 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830{
2831 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002832 PyErr_BadInternalCall();
2833 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002835 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836}
2837
Victor Stinner137c34c2010-09-29 10:25:54 +00002838wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002839PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002840 Py_ssize_t *size)
2841{
2842 wchar_t* buffer;
2843 Py_ssize_t buflen;
2844
2845 if (unicode == NULL) {
2846 PyErr_BadInternalCall();
2847 return NULL;
2848 }
2849
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002850 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002851 if (buflen == -1)
2852 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002853 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002854 PyErr_NoMemory();
2855 return NULL;
2856 }
2857
Victor Stinner137c34c2010-09-29 10:25:54 +00002858 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2859 if (buffer == NULL) {
2860 PyErr_NoMemory();
2861 return NULL;
2862 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002863 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002864 if (buflen == -1) {
2865 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002866 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002867 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002868 if (size != NULL)
2869 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002870 return buffer;
2871}
2872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002873#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002874
Alexander Belopolsky40018472011-02-26 01:02:56 +00002875PyObject *
2876PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002877{
Victor Stinner8faf8212011-12-08 22:14:11 +01002878 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002879 PyErr_SetString(PyExc_ValueError,
2880 "chr() arg not in range(0x110000)");
2881 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002882 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002883
Victor Stinner985a82a2014-01-03 12:53:47 +01002884 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002885}
2886
Alexander Belopolsky40018472011-02-26 01:02:56 +00002887PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002888PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002890 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002891 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002892 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002893 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002894 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002895 Py_INCREF(obj);
2896 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002897 }
2898 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002899 /* For a Unicode subtype that's not a Unicode object,
2900 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002901 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002902 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002903 PyErr_Format(PyExc_TypeError,
2904 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002905 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002906 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002907}
2908
Alexander Belopolsky40018472011-02-26 01:02:56 +00002909PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002910PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002911 const char *encoding,
2912 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002913{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002914 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002915 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002916
Guido van Rossumd57fd912000-03-10 22:53:23 +00002917 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002918 PyErr_BadInternalCall();
2919 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002920 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002921
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002922 /* Decoding bytes objects is the most common case and should be fast */
2923 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002924 if (PyBytes_GET_SIZE(obj) == 0)
2925 _Py_RETURN_UNICODE_EMPTY();
2926 v = PyUnicode_Decode(
2927 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2928 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002929 return v;
2930 }
2931
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002932 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002933 PyErr_SetString(PyExc_TypeError,
2934 "decoding str is not supported");
2935 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002936 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002937
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002938 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2939 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2940 PyErr_Format(PyExc_TypeError,
2941 "coercing to str: need bytes, bytearray "
2942 "or buffer-like object, %.80s found",
2943 Py_TYPE(obj)->tp_name);
2944 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002945 }
Tim Petersced69f82003-09-16 20:30:58 +00002946
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002947 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002948 PyBuffer_Release(&buffer);
2949 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002950 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002951
Serhiy Storchaka05997252013-01-26 12:14:02 +02002952 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002953 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002954 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955}
2956
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The result, including its terminating null byte, is written to `lower`,
   which has room for `lower_len` bytes.

   Return 0 on error (the normalized name does not fit into lower_len-1
   characters, or lower_len is 0), 1 on success. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* An empty buffer cannot even hold the terminator.  Bail out before
       computing lower_len - 1, which would wrap around for a size_t of 0
       and disable the bounds check below. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last slot of the buffer, reserved for the null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2996
Alexander Belopolsky40018472011-02-26 01:02:56 +00002997PyObject *
2998PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002999 Py_ssize_t size,
3000 const char *encoding,
3001 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003002{
3003 PyObject *buffer = NULL, *unicode;
3004 Py_buffer info;
3005 char lower[11]; /* Enough for any encoding shortcut */
3006
Fred Drakee4315f52000-05-09 19:53:39 +00003007 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003008 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003009 if ((strcmp(lower, "utf-8") == 0) ||
3010 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003011 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003012 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003013 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003014 (strcmp(lower, "iso-8859-1") == 0) ||
3015 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00003016 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003017#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003018 else if (strcmp(lower, "mbcs") == 0)
3019 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003020#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003021 else if (strcmp(lower, "ascii") == 0)
3022 return PyUnicode_DecodeASCII(s, size, errors);
3023 else if (strcmp(lower, "utf-16") == 0)
3024 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3025 else if (strcmp(lower, "utf-32") == 0)
3026 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3027 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028
3029 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003030 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003031 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003032 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003033 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 if (buffer == NULL)
3035 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003036 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 if (unicode == NULL)
3038 goto onError;
3039 if (!PyUnicode_Check(unicode)) {
3040 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003041 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3042 "use codecs.decode() to decode to arbitrary types",
3043 encoding,
3044 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045 Py_DECREF(unicode);
3046 goto onError;
3047 }
3048 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003049 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003050
Benjamin Peterson29060642009-01-31 22:14:21 +00003051 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 Py_XDECREF(buffer);
3053 return NULL;
3054}
3055
Alexander Belopolsky40018472011-02-26 01:02:56 +00003056PyObject *
3057PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003058 const char *encoding,
3059 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003060{
3061 PyObject *v;
3062
3063 if (!PyUnicode_Check(unicode)) {
3064 PyErr_BadArgument();
3065 goto onError;
3066 }
3067
3068 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003069 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003070
3071 /* Decode via the codec registry */
3072 v = PyCodec_Decode(unicode, encoding, errors);
3073 if (v == NULL)
3074 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003075 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003076
Benjamin Peterson29060642009-01-31 22:14:21 +00003077 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003078 return NULL;
3079}
3080
Alexander Belopolsky40018472011-02-26 01:02:56 +00003081PyObject *
3082PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003083 const char *encoding,
3084 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003085{
3086 PyObject *v;
3087
3088 if (!PyUnicode_Check(unicode)) {
3089 PyErr_BadArgument();
3090 goto onError;
3091 }
3092
3093 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003094 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003095
3096 /* Decode via the codec registry */
3097 v = PyCodec_Decode(unicode, encoding, errors);
3098 if (v == NULL)
3099 goto onError;
3100 if (!PyUnicode_Check(v)) {
3101 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003102 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3103 "use codecs.decode() to decode to arbitrary types",
3104 encoding,
3105 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003106 Py_DECREF(v);
3107 goto onError;
3108 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003109 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003110
Benjamin Peterson29060642009-01-31 22:14:21 +00003111 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003112 return NULL;
3113}
3114
Alexander Belopolsky40018472011-02-26 01:02:56 +00003115PyObject *
3116PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003117 Py_ssize_t size,
3118 const char *encoding,
3119 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120{
3121 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003122
Guido van Rossumd57fd912000-03-10 22:53:23 +00003123 unicode = PyUnicode_FromUnicode(s, size);
3124 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003125 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3127 Py_DECREF(unicode);
3128 return v;
3129}
3130
Alexander Belopolsky40018472011-02-26 01:02:56 +00003131PyObject *
3132PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003133 const char *encoding,
3134 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003135{
3136 PyObject *v;
3137
3138 if (!PyUnicode_Check(unicode)) {
3139 PyErr_BadArgument();
3140 goto onError;
3141 }
3142
3143 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003144 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003145
3146 /* Encode via the codec registry */
3147 v = PyCodec_Encode(unicode, encoding, errors);
3148 if (v == NULL)
3149 goto onError;
3150 return v;
3151
Benjamin Peterson29060642009-01-31 22:14:21 +00003152 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003153 return NULL;
3154}
3155
/* Locate the first wide character of `wstr` that wcstombs() refuses to
   convert, by converting the string one character at a time (a surrogate
   pair counts as one character when wchar_t is 16 bits wide).

   Return the index of that character, or 0 when every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3];           /* surrogate pair + null character */
#else
    wchar_t chunk[2];           /* one character + null character */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;
    const wchar_t *unit_start;
    size_t converted;

    /* The last slot always holds the terminator. */
    chunk[sizeof(chunk) / sizeof(chunk[0]) - 1] = 0;

    for (cursor = wstr; *cursor != L'\0'; ) {
        unit_start = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            chunk[0] = *cursor++;
            chunk[1] = *cursor++;
        }
        else {
            chunk[0] = *cursor++;
            chunk[1] = 0;
        }
#else
        chunk[0] = *cursor++;
#endif
        converted = wcstombs(scratch, chunk, sizeof(scratch));
        if (converted == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3202
Victor Stinner1b579672011-12-17 05:47:23 +01003203static int
3204locale_error_handler(const char *errors, int *surrogateescape)
3205{
3206 if (errors == NULL) {
3207 *surrogateescape = 0;
3208 return 0;
3209 }
3210
3211 if (strcmp(errors, "strict") == 0) {
3212 *surrogateescape = 0;
3213 return 0;
3214 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003215 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003216 *surrogateescape = 1;
3217 return 0;
3218 }
3219 PyErr_Format(PyExc_ValueError,
3220 "only 'strict' and 'surrogateescape' error handlers "
3221 "are supported, not '%s'",
3222 errors);
3223 return -1;
3224}
3225
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003226PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003227PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003228{
3229 Py_ssize_t wlen, wlen2;
3230 wchar_t *wstr;
3231 PyObject *bytes = NULL;
3232 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003233 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003234 PyObject *exc;
3235 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003236 int surrogateescape;
3237
3238 if (locale_error_handler(errors, &surrogateescape) < 0)
3239 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003240
3241 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3242 if (wstr == NULL)
3243 return NULL;
3244
3245 wlen2 = wcslen(wstr);
3246 if (wlen2 != wlen) {
3247 PyMem_Free(wstr);
3248 PyErr_SetString(PyExc_TypeError, "embedded null character");
3249 return NULL;
3250 }
3251
3252 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003253 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003254 char *str;
3255
3256 str = _Py_wchar2char(wstr, &error_pos);
3257 if (str == NULL) {
3258 if (error_pos == (size_t)-1) {
3259 PyErr_NoMemory();
3260 PyMem_Free(wstr);
3261 return NULL;
3262 }
3263 else {
3264 goto encode_error;
3265 }
3266 }
3267 PyMem_Free(wstr);
3268
3269 bytes = PyBytes_FromString(str);
3270 PyMem_Free(str);
3271 }
3272 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003273 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003274 size_t len, len2;
3275
3276 len = wcstombs(NULL, wstr, 0);
3277 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003278 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003279 goto encode_error;
3280 }
3281
3282 bytes = PyBytes_FromStringAndSize(NULL, len);
3283 if (bytes == NULL) {
3284 PyMem_Free(wstr);
3285 return NULL;
3286 }
3287
3288 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3289 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003290 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003291 goto encode_error;
3292 }
3293 PyMem_Free(wstr);
3294 }
3295 return bytes;
3296
3297encode_error:
3298 errmsg = strerror(errno);
3299 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003300
3301 if (error_pos == (size_t)-1)
3302 error_pos = wcstombs_errorpos(wstr);
3303
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003304 PyMem_Free(wstr);
3305 Py_XDECREF(bytes);
3306
Victor Stinner2f197072011-12-17 07:08:30 +01003307 if (errmsg != NULL) {
3308 size_t errlen;
3309 wstr = _Py_char2wchar(errmsg, &errlen);
3310 if (wstr != NULL) {
3311 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003312 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003313 } else
3314 errmsg = NULL;
3315 }
3316 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003317 reason = PyUnicode_FromString(
3318 "wcstombs() encountered an unencodable "
3319 "wide character");
3320 if (reason == NULL)
3321 return NULL;
3322
3323 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3324 "locale", unicode,
3325 (Py_ssize_t)error_pos,
3326 (Py_ssize_t)(error_pos+1),
3327 reason);
3328 Py_DECREF(reason);
3329 if (exc != NULL) {
3330 PyCodec_StrictErrors(exc);
3331 Py_XDECREF(exc);
3332 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003333 return NULL;
3334}
3335
Victor Stinnerad158722010-10-27 00:25:46 +00003336PyObject *
3337PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003338{
Victor Stinner99b95382011-07-04 14:23:54 +02003339#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003340 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003341#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003342 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003343#else
Victor Stinner793b5312011-04-27 00:24:21 +02003344 PyInterpreterState *interp = PyThreadState_GET()->interp;
3345 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3346 cannot use it to encode and decode filenames before it is loaded. Load
3347 the Python codec requires to encode at least its own filename. Use the C
3348 version of the locale codec until the codec registry is initialized and
3349 the Python codec is loaded.
3350
3351 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3352 cannot only rely on it: check also interp->fscodec_initialized for
3353 subinterpreters. */
3354 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003355 return PyUnicode_AsEncodedString(unicode,
3356 Py_FileSystemDefaultEncoding,
3357 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003358 }
3359 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003360 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003361 }
Victor Stinnerad158722010-10-27 00:25:46 +00003362#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003363}
3364
Alexander Belopolsky40018472011-02-26 01:02:56 +00003365PyObject *
3366PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003367 const char *encoding,
3368 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369{
3370 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003371 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003372
Guido van Rossumd57fd912000-03-10 22:53:23 +00003373 if (!PyUnicode_Check(unicode)) {
3374 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003375 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003376 }
Fred Drakee4315f52000-05-09 19:53:39 +00003377
Fred Drakee4315f52000-05-09 19:53:39 +00003378 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003379 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003380 if ((strcmp(lower, "utf-8") == 0) ||
3381 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003382 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003383 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003384 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003385 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003386 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003387 }
Victor Stinner37296e82010-06-10 13:36:23 +00003388 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003389 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003390 (strcmp(lower, "iso-8859-1") == 0) ||
3391 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003392 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003393#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003394 else if (strcmp(lower, "mbcs") == 0)
3395 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003396#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003397 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003398 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003399 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003400
3401 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003402 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003404 return NULL;
3405
3406 /* The normal path */
3407 if (PyBytes_Check(v))
3408 return v;
3409
3410 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003411 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003412 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003413 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003414
3415 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003416 "encoder %s returned bytearray instead of bytes; "
3417 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003418 encoding);
3419 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003420 Py_DECREF(v);
3421 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003422 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003423
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003424 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3425 Py_DECREF(v);
3426 return b;
3427 }
3428
3429 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003430 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3431 "use codecs.encode() to encode to arbitrary types",
3432 encoding,
3433 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003434 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003435 return NULL;
3436}
3437
Alexander Belopolsky40018472011-02-26 01:02:56 +00003438PyObject *
3439PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003440 const char *encoding,
3441 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003442{
3443 PyObject *v;
3444
3445 if (!PyUnicode_Check(unicode)) {
3446 PyErr_BadArgument();
3447 goto onError;
3448 }
3449
3450 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003451 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003452
3453 /* Encode via the codec registry */
3454 v = PyCodec_Encode(unicode, encoding, errors);
3455 if (v == NULL)
3456 goto onError;
3457 if (!PyUnicode_Check(v)) {
3458 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003459 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3460 "use codecs.encode() to encode to arbitrary types",
3461 encoding,
3462 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003463 Py_DECREF(v);
3464 goto onError;
3465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003467
Benjamin Peterson29060642009-01-31 22:14:21 +00003468 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469 return NULL;
3470}
3471
/* Locate the first byte sequence in the `len` bytes at `str` that mbrtowc()
   cannot decode (invalid or incomplete character).

   Return the byte offset of that sequence, or 0 when none is found.  When
   HAVE_MBRTOWC is not defined the search is not possible and 0 is always
   returned. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    /* Start from the initial conversion state. */
    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        str += converted;
        len -= converted;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    /* No mbrtowc(): the position cannot be determined. */
    (void)str;
    (void)len;
    return 0;
#endif
}
3502
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003503PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003504PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003505 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003506{
3507 wchar_t smallbuf[256];
3508 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3509 wchar_t *wstr;
3510 size_t wlen, wlen2;
3511 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003512 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003513 size_t error_pos;
3514 char *errmsg;
3515 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003516
3517 if (locale_error_handler(errors, &surrogateescape) < 0)
3518 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003519
3520 if (str[len] != '\0' || len != strlen(str)) {
3521 PyErr_SetString(PyExc_TypeError, "embedded null character");
3522 return NULL;
3523 }
3524
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003525 if (surrogateescape) {
3526 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003527 wstr = _Py_char2wchar(str, &wlen);
3528 if (wstr == NULL) {
3529 if (wlen == (size_t)-1)
3530 PyErr_NoMemory();
3531 else
3532 PyErr_SetFromErrno(PyExc_OSError);
3533 return NULL;
3534 }
3535
3536 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003537 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003538 }
3539 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003540 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003541#ifndef HAVE_BROKEN_MBSTOWCS
3542 wlen = mbstowcs(NULL, str, 0);
3543#else
3544 wlen = len;
3545#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003546 if (wlen == (size_t)-1)
3547 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003548 if (wlen+1 <= smallbuf_len) {
3549 wstr = smallbuf;
3550 }
3551 else {
3552 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3553 return PyErr_NoMemory();
3554
3555 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3556 if (!wstr)
3557 return PyErr_NoMemory();
3558 }
3559
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003560 wlen2 = mbstowcs(wstr, str, wlen+1);
3561 if (wlen2 == (size_t)-1) {
3562 if (wstr != smallbuf)
3563 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003564 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003565 }
3566#ifdef HAVE_BROKEN_MBSTOWCS
3567 assert(wlen2 == wlen);
3568#endif
3569 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3570 if (wstr != smallbuf)
3571 PyMem_Free(wstr);
3572 }
3573 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003574
3575decode_error:
3576 errmsg = strerror(errno);
3577 assert(errmsg != NULL);
3578
3579 error_pos = mbstowcs_errorpos(str, len);
3580 if (errmsg != NULL) {
3581 size_t errlen;
3582 wstr = _Py_char2wchar(errmsg, &errlen);
3583 if (wstr != NULL) {
3584 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003585 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003586 } else
3587 errmsg = NULL;
3588 }
3589 if (errmsg == NULL)
3590 reason = PyUnicode_FromString(
3591 "mbstowcs() encountered an invalid multibyte sequence");
3592 if (reason == NULL)
3593 return NULL;
3594
3595 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3596 "locale", str, len,
3597 (Py_ssize_t)error_pos,
3598 (Py_ssize_t)(error_pos+1),
3599 reason);
3600 Py_DECREF(reason);
3601 if (exc != NULL) {
3602 PyCodec_StrictErrors(exc);
3603 Py_XDECREF(exc);
3604 }
3605 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003606}
3607
3608PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003609PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003610{
3611 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003612 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003613}
3614
3615
3616PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003617PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003618 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003619 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3620}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003621
Christian Heimes5894ba72007-11-04 11:43:14 +00003622PyObject*
3623PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3624{
Victor Stinner99b95382011-07-04 14:23:54 +02003625#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003626 return PyUnicode_DecodeMBCS(s, size, NULL);
3627#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003628 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003629#else
Victor Stinner793b5312011-04-27 00:24:21 +02003630 PyInterpreterState *interp = PyThreadState_GET()->interp;
3631 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3632 cannot use it to encode and decode filenames before it is loaded. Load
3633 the Python codec requires to encode at least its own filename. Use the C
3634 version of the locale codec until the codec registry is initialized and
3635 the Python codec is loaded.
3636
3637 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3638 cannot only rely on it: check also interp->fscodec_initialized for
3639 subinterpreters. */
3640 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003641 return PyUnicode_Decode(s, size,
3642 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003643 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003644 }
3645 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003646 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003647 }
Victor Stinnerad158722010-10-27 00:25:46 +00003648#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003649}
3650
Martin v. Löwis011e8422009-05-05 04:43:17 +00003651
3652int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003653_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003654{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003655 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003656
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003657 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003658 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003659 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3660 PyUnicode_GET_LENGTH(str), '\0', 1);
3661 if (pos == -1)
3662 return 0;
3663 else
3664 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003665}
3666
Antoine Pitrou13348842012-01-29 18:36:34 +01003667int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003668PyUnicode_FSConverter(PyObject* arg, void* addr)
3669{
3670 PyObject *output = NULL;
3671 Py_ssize_t size;
3672 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003673 if (arg == NULL) {
3674 Py_DECREF(*(PyObject**)addr);
3675 return 1;
3676 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003677 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003678 output = arg;
3679 Py_INCREF(output);
3680 }
3681 else {
3682 arg = PyUnicode_FromObject(arg);
3683 if (!arg)
3684 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003685 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003686 Py_DECREF(arg);
3687 if (!output)
3688 return 0;
3689 if (!PyBytes_Check(output)) {
3690 Py_DECREF(output);
3691 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3692 return 0;
3693 }
3694 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003695 size = PyBytes_GET_SIZE(output);
3696 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003697 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003698 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003699 Py_DECREF(output);
3700 return 0;
3701 }
3702 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003703 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003704}
3705
3706
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003707int
3708PyUnicode_FSDecoder(PyObject* arg, void* addr)
3709{
3710 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003711 if (arg == NULL) {
3712 Py_DECREF(*(PyObject**)addr);
3713 return 1;
3714 }
3715 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003716 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003717 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003718 output = arg;
3719 Py_INCREF(output);
3720 }
3721 else {
3722 arg = PyBytes_FromObject(arg);
3723 if (!arg)
3724 return 0;
3725 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3726 PyBytes_GET_SIZE(arg));
3727 Py_DECREF(arg);
3728 if (!output)
3729 return 0;
3730 if (!PyUnicode_Check(output)) {
3731 Py_DECREF(output);
3732 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3733 return 0;
3734 }
3735 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003736 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003737 Py_DECREF(output);
3738 return 0;
3739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003740 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003741 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003742 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3743 Py_DECREF(output);
3744 return 0;
3745 }
3746 *(PyObject**)addr = output;
3747 return Py_CLEANUP_SUPPORTED;
3748}
3749
3750
Martin v. Löwis5b222132007-06-10 09:51:05 +00003751char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003752PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003753{
Christian Heimesf3863112007-11-22 07:46:41 +00003754 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003755
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003756 if (!PyUnicode_Check(unicode)) {
3757 PyErr_BadArgument();
3758 return NULL;
3759 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003760 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003761 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003762
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003763 if (PyUnicode_UTF8(unicode) == NULL) {
3764 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003765 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3766 if (bytes == NULL)
3767 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003768 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3769 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003770 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771 Py_DECREF(bytes);
3772 return NULL;
3773 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003774 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3775 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3776 PyBytes_AS_STRING(bytes),
3777 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003778 Py_DECREF(bytes);
3779 }
3780
3781 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003782 *psize = PyUnicode_UTF8_LENGTH(unicode);
3783 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003784}
3785
3786char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003787PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003788{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003789 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3790}
3791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003792Py_UNICODE *
3793PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3794{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003795 const unsigned char *one_byte;
3796#if SIZEOF_WCHAR_T == 4
3797 const Py_UCS2 *two_bytes;
3798#else
3799 const Py_UCS4 *four_bytes;
3800 const Py_UCS4 *ucs4_end;
3801 Py_ssize_t num_surrogates;
3802#endif
3803 wchar_t *w;
3804 wchar_t *wchar_end;
3805
3806 if (!PyUnicode_Check(unicode)) {
3807 PyErr_BadArgument();
3808 return NULL;
3809 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003810 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003811 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003812 assert(_PyUnicode_KIND(unicode) != 0);
3813 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003815 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003816#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003817 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3818 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003819 num_surrogates = 0;
3820
3821 for (; four_bytes < ucs4_end; ++four_bytes) {
3822 if (*four_bytes > 0xFFFF)
3823 ++num_surrogates;
3824 }
3825
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003826 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3827 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3828 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003829 PyErr_NoMemory();
3830 return NULL;
3831 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003832 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003833
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003834 w = _PyUnicode_WSTR(unicode);
3835 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3836 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003837 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3838 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003839 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003840 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003841 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3842 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003843 }
3844 else
3845 *w = *four_bytes;
3846
3847 if (w > wchar_end) {
3848 assert(0 && "Miscalculated string end");
3849 }
3850 }
3851 *w = 0;
3852#else
3853 /* sizeof(wchar_t) == 4 */
3854 Py_FatalError("Impossible unicode object state, wstr and str "
3855 "should share memory already.");
3856 return NULL;
3857#endif
3858 }
3859 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003860 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3861 (_PyUnicode_LENGTH(unicode) + 1));
3862 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003863 PyErr_NoMemory();
3864 return NULL;
3865 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003866 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3867 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3868 w = _PyUnicode_WSTR(unicode);
3869 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003870
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003871 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3872 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003873 for (; w < wchar_end; ++one_byte, ++w)
3874 *w = *one_byte;
3875 /* null-terminate the wstr */
3876 *w = 0;
3877 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003878 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003879#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003880 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881 for (; w < wchar_end; ++two_bytes, ++w)
3882 *w = *two_bytes;
3883 /* null-terminate the wstr */
3884 *w = 0;
3885#else
3886 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003887 PyObject_FREE(_PyUnicode_WSTR(unicode));
3888 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003889 Py_FatalError("Impossible unicode object state, wstr "
3890 "and str should share memory already.");
3891 return NULL;
3892#endif
3893 }
3894 else {
3895 assert(0 && "This should never happen.");
3896 }
3897 }
3898 }
3899 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003900 *size = PyUnicode_WSTR_LENGTH(unicode);
3901 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003902}
3903
Alexander Belopolsky40018472011-02-26 01:02:56 +00003904Py_UNICODE *
3905PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003907 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003908}
3909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003910
Alexander Belopolsky40018472011-02-26 01:02:56 +00003911Py_ssize_t
3912PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913{
3914 if (!PyUnicode_Check(unicode)) {
3915 PyErr_BadArgument();
3916 goto onError;
3917 }
3918 return PyUnicode_GET_SIZE(unicode);
3919
Benjamin Peterson29060642009-01-31 22:14:21 +00003920 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921 return -1;
3922}
3923
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003924Py_ssize_t
3925PyUnicode_GetLength(PyObject *unicode)
3926{
Victor Stinner07621332012-06-16 04:53:46 +02003927 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003928 PyErr_BadArgument();
3929 return -1;
3930 }
Victor Stinner07621332012-06-16 04:53:46 +02003931 if (PyUnicode_READY(unicode) == -1)
3932 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003933 return PyUnicode_GET_LENGTH(unicode);
3934}
3935
3936Py_UCS4
3937PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3938{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003939 void *data;
3940 int kind;
3941
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003942 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3943 PyErr_BadArgument();
3944 return (Py_UCS4)-1;
3945 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003946 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003947 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003948 return (Py_UCS4)-1;
3949 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003950 data = PyUnicode_DATA(unicode);
3951 kind = PyUnicode_KIND(unicode);
3952 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003953}
3954
3955int
3956PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3957{
3958 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003959 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003960 return -1;
3961 }
Victor Stinner488fa492011-12-12 00:01:39 +01003962 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003963 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003964 PyErr_SetString(PyExc_IndexError, "string index out of range");
3965 return -1;
3966 }
Victor Stinner488fa492011-12-12 00:01:39 +01003967 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003968 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003969 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3970 PyErr_SetString(PyExc_ValueError, "character out of range");
3971 return -1;
3972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003973 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3974 index, ch);
3975 return 0;
3976}
3977
/* Return the name of the default encoding; it is always the constant
   string "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3983
Victor Stinner554f3f02010-06-16 23:33:54 +00003984/* create or adjust a UnicodeDecodeError */
3985static void
3986make_decode_exception(PyObject **exceptionObject,
3987 const char *encoding,
3988 const char *input, Py_ssize_t length,
3989 Py_ssize_t startpos, Py_ssize_t endpos,
3990 const char *reason)
3991{
3992 if (*exceptionObject == NULL) {
3993 *exceptionObject = PyUnicodeDecodeError_Create(
3994 encoding, input, length, startpos, endpos, reason);
3995 }
3996 else {
3997 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3998 goto onError;
3999 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4000 goto onError;
4001 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4002 goto onError;
4003 }
4004 return;
4005
4006onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004007 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004008}
4009
#ifdef HAVE_MBCS
/* Error handling callback helper (wchar_t output buffer variant):
   build the arguments, call the error handler callback and check its
   result; if no exception occurred, copy the replacement to the output
   and adjust the state variables.

   errors, errorHandler   name of the handler and the cached callable
                          (looked up on first use);
   encoding, reason       used to create or update *exceptionObject;
   input, inend           input buffer bounds; refreshed from the
                          exception's object, which the callback may have
                          replaced;
   startinpos, endinpos   span of the error; *endinpos is set to the
                          position returned by the callback;
   inptr                  set to the position where decoding resumes;
   output, outpos         wchar_t-kind result string (resized when too
                          small) and the write position, which is
                          advanced past the replacement.

   Returns 0 on success, -1 on error (with an exception set). */

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" prefix, leaving only the message. */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Do not go on and read a non-bytes object as bytes; give up our
           reference and report the error. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    /* Guard the additions below against signed overflow;
       insize - newpos is >= 0 because 0 <= newpos <= insize. */
    if (repwlen > PY_SSIZE_T_MAX - *outpos
        || insize - newpos > PY_SSIZE_T_MAX - *outpos - repwlen) {
        PyErr_NoMemory();
        goto onError;
    }
    requiredsize = *outpos + repwlen + insize-newpos;
    if (requiredsize > outsize) {
        if (requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif /* HAVE_MBCS */
4116
4117static int
4118unicode_decode_call_errorhandler_writer(
4119 const char *errors, PyObject **errorHandler,
4120 const char *encoding, const char *reason,
4121 const char **input, const char **inend, Py_ssize_t *startinpos,
4122 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4123 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4124{
4125 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4126
4127 PyObject *restuple = NULL;
4128 PyObject *repunicode = NULL;
4129 Py_ssize_t insize;
4130 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004131 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004132 PyObject *inputobj = NULL;
4133
4134 if (*errorHandler == NULL) {
4135 *errorHandler = PyCodec_LookupError(errors);
4136 if (*errorHandler == NULL)
4137 goto onError;
4138 }
4139
4140 make_decode_exception(exceptionObject,
4141 encoding,
4142 *input, *inend - *input,
4143 *startinpos, *endinpos,
4144 reason);
4145 if (*exceptionObject == NULL)
4146 goto onError;
4147
4148 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4149 if (restuple == NULL)
4150 goto onError;
4151 if (!PyTuple_Check(restuple)) {
4152 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4153 goto onError;
4154 }
4155 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004156 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004157
4158 /* Copy back the bytes variables, which might have been modified by the
4159 callback */
4160 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4161 if (!inputobj)
4162 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004163 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004164 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004165 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004166 *input = PyBytes_AS_STRING(inputobj);
4167 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004168 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004169 /* we can DECREF safely, as the exception has another reference,
4170 so the object won't go away. */
4171 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004172
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004173 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004175 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004176 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4177 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004178 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004179
Victor Stinner8f674cc2013-04-17 23:02:17 +02004180 if (PyUnicode_READY(repunicode) < 0)
4181 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004182 replen = PyUnicode_GET_LENGTH(repunicode);
4183 writer->min_length += replen;
4184 if (replen > 1)
Victor Stinner8f674cc2013-04-17 23:02:17 +02004185 writer->overallocate = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004186 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004187 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004188
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004190 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004191
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004193 Py_XDECREF(restuple);
4194 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004195
Benjamin Peterson29060642009-01-31 22:14:21 +00004196 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004198 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199}
4200
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The directO and directWS arguments are parenthesized in the expansion
 * so that an expression argument (for example `a || b`) is evaluated as
 * a whole before being combined with the category test. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004281
Alexander Belopolsky40018472011-02-26 01:02:56 +00004282PyObject *
4283PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004284 Py_ssize_t size,
4285 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004286{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004287 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4288}
4289
Antoine Pitrou244651a2009-05-04 18:56:13 +00004290/* The decoder. The only state we preserve is our read position,
4291 * i.e. how many characters we have consumed. So if we end in the
4292 * middle of a shift sequence we have to back off the read position
4293 * and the output to the beginning of the sequence, otherwise we lose
4294 * all the shift state (seen bits, number of bits seen, high
4295 * surrogate). */
4296
Alexander Belopolsky40018472011-02-26 01:02:56 +00004297PyObject *
4298PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004299 Py_ssize_t size,
4300 const char *errors,
4301 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004302{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004303 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004304 Py_ssize_t startinpos;
4305 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004306 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004307 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004308 const char *errmsg = "";
4309 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004310 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004311 unsigned int base64bits = 0;
4312 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004313 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004314 PyObject *errorHandler = NULL;
4315 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004316
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004317 if (size == 0) {
4318 if (consumed)
4319 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004320 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004321 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004322
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004323 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004324 _PyUnicodeWriter_Init(&writer);
4325 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004326
4327 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004328 e = s + size;
4329
4330 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004331 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004332 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004333 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004334
Antoine Pitrou244651a2009-05-04 18:56:13 +00004335 if (inShift) { /* in a base-64 section */
4336 if (IS_BASE64(ch)) { /* consume a base-64 character */
4337 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4338 base64bits += 6;
4339 s++;
4340 if (base64bits >= 16) {
4341 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004342 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004343 base64bits -= 16;
4344 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004345 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004346 if (surrogate) {
4347 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004348 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4349 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004350 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004351 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004352 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004353 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004354 }
4355 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004356 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004357 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004358 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359 }
4360 }
Victor Stinner551ac952011-11-29 22:58:13 +01004361 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004362 /* first surrogate */
4363 surrogate = outCh;
4364 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004365 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004366 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004367 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004368 }
4369 }
4370 }
4371 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004372 inShift = 0;
4373 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004374 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004375 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004376 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004377 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004378 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004379 if (base64bits > 0) { /* left-over bits */
4380 if (base64bits >= 6) {
4381 /* We've seen at least one base-64 character */
4382 errmsg = "partial character in shift sequence";
4383 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004384 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004385 else {
4386 /* Some bits remain; they should be zero */
4387 if (base64buffer != 0) {
4388 errmsg = "non-zero padding bits in shift sequence";
4389 goto utf7Error;
4390 }
4391 }
4392 }
4393 if (ch != '-') {
4394 /* '-' is absorbed; other terminating
4395 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004396 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004397 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004398 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004399 }
4400 }
4401 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004403 s++; /* consume '+' */
4404 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004405 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004406 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004407 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004408 }
4409 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004410 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004411 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004412 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004413 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004414 }
4415 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004416 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004417 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004418 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004419 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004420 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004421 else {
4422 startinpos = s-starts;
4423 s++;
4424 errmsg = "unexpected special character";
4425 goto utf7Error;
4426 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004427 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004428utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004429 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004430 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 errors, &errorHandler,
4432 "utf7", errmsg,
4433 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004434 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004435 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004436 }
4437
Antoine Pitrou244651a2009-05-04 18:56:13 +00004438 /* end of string */
4439
4440 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4441 /* if we're in an inconsistent state, that's an error */
4442 if (surrogate ||
4443 (base64bits >= 6) ||
4444 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004445 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004446 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004447 errors, &errorHandler,
4448 "utf7", "unterminated shift sequence",
4449 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004450 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004451 goto onError;
4452 if (s < e)
4453 goto restart;
4454 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004455 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004456
4457 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004458 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004459 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004460 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004461 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004462 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004463 writer.kind, writer.data, shiftOutStart);
4464 Py_XDECREF(errorHandler);
4465 Py_XDECREF(exc);
4466 _PyUnicodeWriter_Dealloc(&writer);
4467 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004468 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004469 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004470 }
4471 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004472 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004473 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004474 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004475
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004476 Py_XDECREF(errorHandler);
4477 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004478 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004479
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004481 Py_XDECREF(errorHandler);
4482 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004483 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004484 return NULL;
4485}
4486
4487
Alexander Belopolsky40018472011-02-26 01:02:56 +00004488PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004489_PyUnicode_EncodeUTF7(PyObject *str,
4490 int base64SetO,
4491 int base64WhiteSpace,
4492 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004493{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004494 int kind;
4495 void *data;
4496 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004497 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004498 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004499 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004500 unsigned int base64bits = 0;
4501 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004502 char * out;
4503 char * start;
4504
Benjamin Petersonbac79492012-01-14 13:34:47 -05004505 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004506 return NULL;
4507 kind = PyUnicode_KIND(str);
4508 data = PyUnicode_DATA(str);
4509 len = PyUnicode_GET_LENGTH(str);
4510
4511 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004513
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004514 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004515 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004516 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004517 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004518 if (v == NULL)
4519 return NULL;
4520
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004521 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004522 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004523 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004524
Antoine Pitrou244651a2009-05-04 18:56:13 +00004525 if (inShift) {
4526 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4527 /* shifting out */
4528 if (base64bits) { /* output remaining bits */
4529 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4530 base64buffer = 0;
4531 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004532 }
4533 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004534 /* Characters not in the BASE64 set implicitly unshift the sequence
4535 so no '-' is required, except if the character is itself a '-' */
4536 if (IS_BASE64(ch) || ch == '-') {
4537 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004538 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004539 *out++ = (char) ch;
4540 }
4541 else {
4542 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004543 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004544 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004545 else { /* not in a shift sequence */
4546 if (ch == '+') {
4547 *out++ = '+';
4548 *out++ = '-';
4549 }
4550 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4551 *out++ = (char) ch;
4552 }
4553 else {
4554 *out++ = '+';
4555 inShift = 1;
4556 goto encode_char;
4557 }
4558 }
4559 continue;
4560encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004562 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004563
Antoine Pitrou244651a2009-05-04 18:56:13 +00004564 /* code first surrogate */
4565 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004566 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004567 while (base64bits >= 6) {
4568 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4569 base64bits -= 6;
4570 }
4571 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004572 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574 base64bits += 16;
4575 base64buffer = (base64buffer << 16) | ch;
4576 while (base64bits >= 6) {
4577 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4578 base64bits -= 6;
4579 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004580 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 if (base64bits)
4582 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4583 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004584 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004585 if (_PyBytes_Resize(&v, out - start) < 0)
4586 return NULL;
4587 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004588}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004589PyObject *
4590PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4591 Py_ssize_t size,
4592 int base64SetO,
4593 int base64WhiteSpace,
4594 const char *errors)
4595{
4596 PyObject *result;
4597 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4598 if (tmp == NULL)
4599 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004600 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004601 base64WhiteSpace, errors);
4602 Py_DECREF(tmp);
4603 return result;
4604}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004605
Antoine Pitrou244651a2009-05-04 18:56:13 +00004606#undef IS_BASE64
4607#undef FROM_BASE64
4608#undef TO_BASE64
4609#undef DECODE_DIRECT
4610#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004611
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612/* --- UTF-8 Codec -------------------------------------------------------- */
4613
Alexander Belopolsky40018472011-02-26 01:02:56 +00004614PyObject *
4615PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004616 Py_ssize_t size,
4617 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618{
Walter Dörwald69652032004-09-07 20:24:22 +00004619 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4620}
4621
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004622#include "stringlib/asciilib.h"
4623#include "stringlib/codecs.h"
4624#include "stringlib/undef.h"
4625
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004626#include "stringlib/ucs1lib.h"
4627#include "stringlib/codecs.h"
4628#include "stringlib/undef.h"
4629
4630#include "stringlib/ucs2lib.h"
4631#include "stringlib/codecs.h"
4632#include "stringlib/undef.h"
4633
4634#include "stringlib/ucs4lib.h"
4635#include "stringlib/codecs.h"
4636#include "stringlib/undef.h"
4637
Antoine Pitrouab868312009-01-10 15:40:25 +00004638/* Mask to quickly check whether a C 'long' contains a
4639 non-ASCII, UTF8-encoded char. */
4640#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004641# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004642#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004643# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004644#else
4645# error C 'long' size should be either 4 or 8!
4646#endif
4647
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004648static Py_ssize_t
4649ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004650{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004651 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004652 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004653
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004654 /*
4655 * Issue #17237: m68k is a bit different from most architectures in
4656 * that objects do not use "natural alignment" - for example, int and
4657 * long are only aligned at 2-byte boundaries. Therefore the assert()
4658 * won't work; also, tests have shown that skipping the "optimised
4659 * version" will even speed up m68k.
4660 */
4661#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004662#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004663 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4664 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004665 /* Fast path, see in STRINGLIB(utf8_decode) for
4666 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004667 /* Help allocation */
4668 const char *_p = p;
4669 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004670 while (_p < aligned_end) {
4671 unsigned long value = *(const unsigned long *) _p;
4672 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004673 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004674 *((unsigned long *)q) = value;
4675 _p += SIZEOF_LONG;
4676 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004677 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004678 p = _p;
4679 while (p < end) {
4680 if ((unsigned char)*p & 0x80)
4681 break;
4682 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004683 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004684 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004686#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004687#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004688 while (p < end) {
4689 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4690 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004691 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004692 /* Help allocation */
4693 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004694 while (_p < aligned_end) {
4695 unsigned long value = *(unsigned long *) _p;
4696 if (value & ASCII_CHAR_MASK)
4697 break;
4698 _p += SIZEOF_LONG;
4699 }
4700 p = _p;
4701 if (_p == end)
4702 break;
4703 }
4704 if ((unsigned char)*p & 0x80)
4705 break;
4706 ++p;
4707 }
4708 memcpy(dest, start, p - start);
4709 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710}
Antoine Pitrouab868312009-01-10 15:40:25 +00004711
Victor Stinner785938e2011-12-11 20:09:03 +01004712PyObject *
4713PyUnicode_DecodeUTF8Stateful(const char *s,
4714 Py_ssize_t size,
4715 const char *errors,
4716 Py_ssize_t *consumed)
4717{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004718 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004719 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004720 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004721
4722 Py_ssize_t startinpos;
4723 Py_ssize_t endinpos;
4724 const char *errmsg = "";
4725 PyObject *errorHandler = NULL;
4726 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004727
4728 if (size == 0) {
4729 if (consumed)
4730 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004731 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004732 }
4733
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004734 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4735 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004736 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004737 *consumed = 1;
4738 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004739 }
4740
Victor Stinner8f674cc2013-04-17 23:02:17 +02004741 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004742 writer.min_length = size;
4743 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004744 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004745
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004746 writer.pos = ascii_decode(s, end, writer.data);
4747 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004748 while (s < end) {
4749 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004750 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004751 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004752 if (PyUnicode_IS_ASCII(writer.buffer))
4753 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004754 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004755 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004756 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004757 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004758 } else {
4759 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004760 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004761 }
4762
4763 switch (ch) {
4764 case 0:
4765 if (s == end || consumed)
4766 goto End;
4767 errmsg = "unexpected end of data";
4768 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004769 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004770 break;
4771 case 1:
4772 errmsg = "invalid start byte";
4773 startinpos = s - starts;
4774 endinpos = startinpos + 1;
4775 break;
4776 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004777 case 3:
4778 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004779 errmsg = "invalid continuation byte";
4780 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004781 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004782 break;
4783 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004784 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004785 goto onError;
4786 continue;
4787 }
4788
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004789 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004790 errors, &errorHandler,
4791 "utf-8", errmsg,
4792 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004793 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004794 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004795 }
4796
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004797End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004798 if (consumed)
4799 *consumed = s - starts;
4800
4801 Py_XDECREF(errorHandler);
4802 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004803 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004804
4805onError:
4806 Py_XDECREF(errorHandler);
4807 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004808 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004809 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004810}
4811
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Every byte that cannot be decoded is stored as the single code unit
   0xDC00 + byte.

   Return a pointer to a newly allocated wide character string (use
   PyMem_RawFree() to free the memory), or NULL on memory allocation
   error or if `size` is too large (or negative) for the buffer size
   to be computed. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    /* Same limit as "PY_SSIZE_T_MAX / sizeof(wchar_t) < size + 1", but
       written without computing size + 1, which would overflow for
       size == PY_SSIZE_T_MAX.  A negative size is rejected as well. */
    if ((size_t)size >= PY_SSIZE_T_MAX / sizeof(wchar_t))
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
            /* the helper handed back a character it did not store */
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* It is about to be split into a surrogate pair, so it must
               be a non-BMP code point, not a surrogate. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* small values are status codes: 0 with the input exhausted
               means we are done, anything else is treated as a decoding
               error at *s */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004867
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004868/* Primary internal function which creates utf8 encoded bytes objects.
4869
4870 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004871 and allocate exactly as much space needed at the end. Else allocate the
4872 maximum possible needed (4 result bytes per Unicode character), and return
4873 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004874*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004875PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004876_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877{
Victor Stinner6099a032011-12-18 14:22:26 +01004878 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004879 void *data;
4880 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004881
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004882 if (!PyUnicode_Check(unicode)) {
4883 PyErr_BadArgument();
4884 return NULL;
4885 }
4886
4887 if (PyUnicode_READY(unicode) == -1)
4888 return NULL;
4889
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004890 if (PyUnicode_UTF8(unicode))
4891 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4892 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004893
4894 kind = PyUnicode_KIND(unicode);
4895 data = PyUnicode_DATA(unicode);
4896 size = PyUnicode_GET_LENGTH(unicode);
4897
Benjamin Petersonead6b532011-12-20 17:23:42 -06004898 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004899 default:
4900 assert(0);
4901 case PyUnicode_1BYTE_KIND:
4902 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4903 assert(!PyUnicode_IS_ASCII(unicode));
4904 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4905 case PyUnicode_2BYTE_KIND:
4906 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4907 case PyUnicode_4BYTE_KIND:
4908 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910}
4911
Alexander Belopolsky40018472011-02-26 01:02:56 +00004912PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004913PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4914 Py_ssize_t size,
4915 const char *errors)
4916{
4917 PyObject *v, *unicode;
4918
4919 unicode = PyUnicode_FromUnicode(s, size);
4920 if (unicode == NULL)
4921 return NULL;
4922 v = _PyUnicode_AsUTF8String(unicode, errors);
4923 Py_DECREF(unicode);
4924 return v;
4925}
4926
4927PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004928PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004930 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931}
4932
Walter Dörwald41980ca2007-08-16 21:55:45 +00004933/* --- UTF-32 Codec ------------------------------------------------------- */
4934
4935PyObject *
4936PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 Py_ssize_t size,
4938 const char *errors,
4939 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004940{
4941 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4942}
4943
4944PyObject *
4945PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 Py_ssize_t size,
4947 const char *errors,
4948 int *byteorder,
4949 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004950{
4951 const char *starts = s;
4952 Py_ssize_t startinpos;
4953 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004954 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004955 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004956 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004957 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004958 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004959 PyObject *errorHandler = NULL;
4960 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004961
Walter Dörwald41980ca2007-08-16 21:55:45 +00004962 q = (unsigned char *)s;
4963 e = q + size;
4964
4965 if (byteorder)
4966 bo = *byteorder;
4967
4968 /* Check for BOM marks (U+FEFF) in the input and adjust current
4969 byte order setting accordingly. In native mode, the leading BOM
4970 mark is skipped, in all other modes, it is copied to the output
4971 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004972 if (bo == 0 && size >= 4) {
4973 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4974 if (bom == 0x0000FEFF) {
4975 bo = -1;
4976 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004977 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004978 else if (bom == 0xFFFE0000) {
4979 bo = 1;
4980 q += 4;
4981 }
4982 if (byteorder)
4983 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004984 }
4985
Victor Stinnere64322e2012-10-30 23:12:47 +01004986 if (q == e) {
4987 if (consumed)
4988 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004989 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004990 }
4991
Victor Stinnere64322e2012-10-30 23:12:47 +01004992#ifdef WORDS_BIGENDIAN
4993 le = bo < 0;
4994#else
4995 le = bo <= 0;
4996#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004997 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01004998
Victor Stinner8f674cc2013-04-17 23:02:17 +02004999 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005000 writer.min_length = (e - q + 3) / 4;
5001 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005002 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005003
Victor Stinnere64322e2012-10-30 23:12:47 +01005004 while (1) {
5005 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005006 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005007
Victor Stinnere64322e2012-10-30 23:12:47 +01005008 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005009 enum PyUnicode_Kind kind = writer.kind;
5010 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005011 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005012 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005013 if (le) {
5014 do {
5015 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5016 if (ch > maxch)
5017 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005018 if (kind != PyUnicode_1BYTE_KIND &&
5019 Py_UNICODE_IS_SURROGATE(ch))
5020 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005021 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005022 q += 4;
5023 } while (q <= last);
5024 }
5025 else {
5026 do {
5027 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5028 if (ch > maxch)
5029 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005030 if (kind != PyUnicode_1BYTE_KIND &&
5031 Py_UNICODE_IS_SURROGATE(ch))
5032 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005033 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005034 q += 4;
5035 } while (q <= last);
5036 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005037 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005038 }
5039
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005040 if (Py_UNICODE_IS_SURROGATE(ch)) {
5041 errmsg = "codepoint in surrogate code point range(0xd800, 0xe000)";
5042 startinpos = ((const char *)q) - starts;
5043 endinpos = startinpos + 4;
5044 }
5045 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005046 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005048 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005049 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005050 startinpos = ((const char *)q) - starts;
5051 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005053 else {
5054 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005055 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005056 goto onError;
5057 q += 4;
5058 continue;
5059 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005060 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005061 startinpos = ((const char *)q) - starts;
5062 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005064
5065 /* The remaining input chars are ignored if the callback
5066 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005067 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005068 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005069 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005070 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005071 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005073 }
5074
Walter Dörwald41980ca2007-08-16 21:55:45 +00005075 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005077
Walter Dörwald41980ca2007-08-16 21:55:45 +00005078 Py_XDECREF(errorHandler);
5079 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005080 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005081
Benjamin Peterson29060642009-01-31 22:14:21 +00005082 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005083 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005084 Py_XDECREF(errorHandler);
5085 Py_XDECREF(exc);
5086 return NULL;
5087}
5088
5089PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005090_PyUnicode_EncodeUTF32(PyObject *str,
5091 const char *errors,
5092 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005093{
Serhiy Storchaka30793282014-01-04 22:44:01 +02005094 int kind;
5095 void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005096 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005097 PyObject *v;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005098 unsigned char *p;
5099 Py_ssize_t nsize, i;
5100 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005101#if PY_LITTLE_ENDIAN
Serhiy Storchaka30793282014-01-04 22:44:01 +02005102 int iorder[] = {0, 1, 2, 3};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005103#else
Serhiy Storchaka30793282014-01-04 22:44:01 +02005104 int iorder[] = {3, 2, 1, 0};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005105#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005106 const char *encoding;
5107 PyObject *errorHandler = NULL;
5108 PyObject *exc = NULL;
5109 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005110
Serhiy Storchaka30793282014-01-04 22:44:01 +02005111#define STORECHAR(CH) \
5112 do { \
5113 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5114 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5115 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5116 p[iorder[0]] = (CH) & 0xff; \
5117 p += 4; \
5118 } while(0)
5119
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005120 if (!PyUnicode_Check(str)) {
5121 PyErr_BadArgument();
5122 return NULL;
5123 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005124 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005125 return NULL;
5126 kind = PyUnicode_KIND(str);
5127 data = PyUnicode_DATA(str);
5128 len = PyUnicode_GET_LENGTH(str);
5129
Serhiy Storchaka583a9392014-01-04 19:25:37 +02005130 nsize = len + (byteorder == 0);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005131 if (nsize > PY_SSIZE_T_MAX / 4)
5132 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005133 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005134 if (v == NULL)
5135 return NULL;
5136
Serhiy Storchaka30793282014-01-04 22:44:01 +02005137 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005138 if (byteorder == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005139 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005140 if (len == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005141 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005142
Serhiy Storchaka30793282014-01-04 22:44:01 +02005143 if (byteorder == -1) {
5144 /* force LE */
5145 iorder[0] = 0;
5146 iorder[1] = 1;
5147 iorder[2] = 2;
5148 iorder[3] = 3;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005149 encoding = "utf-32-le";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005150 }
5151 else if (byteorder == 1) {
5152 /* force BE */
5153 iorder[0] = 3;
5154 iorder[1] = 2;
5155 iorder[2] = 1;
5156 iorder[3] = 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005157 encoding = "utf-32-be";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005158 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005159 else
5160 encoding = "utf-32";
5161
5162 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005163 for (i = 0; i < len; i++)
5164 STORECHAR(PyUnicode_READ(kind, data, i));
5165 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005166 }
5167
Serhiy Storchaka30793282014-01-04 22:44:01 +02005168 for (i = 0; i < len;) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005169 Py_ssize_t repsize, moreunits;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005170 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5171 i++;
5172 assert(ch <= MAX_UNICODE);
5173 if (!Py_UNICODE_IS_SURROGATE(ch)) {
5174 STORECHAR(ch);
5175 continue;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005176 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005177
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005178 rep = unicode_encode_call_errorhandler(
5179 errors, &errorHandler,
5180 encoding, "surrogates not allowed",
Serhiy Storchaka30793282014-01-04 22:44:01 +02005181 str, &exc, i-1, i, &i);
5182
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005183 if (!rep)
5184 goto error;
5185
5186 if (PyBytes_Check(rep)) {
5187 repsize = PyBytes_GET_SIZE(rep);
5188 if (repsize & 3) {
5189 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005190 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005191 "surrogates not allowed");
5192 goto error;
5193 }
5194 moreunits = repsize / 4;
5195 }
5196 else {
5197 assert(PyUnicode_Check(rep));
5198 if (PyUnicode_READY(rep) < 0)
5199 goto error;
5200 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5201 if (!PyUnicode_IS_ASCII(rep)) {
5202 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005203 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005204 "surrogates not allowed");
5205 goto error;
5206 }
5207 }
5208
5209 /* four bytes are reserved for each surrogate */
5210 if (moreunits > 1) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005211 Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005212 Py_ssize_t morebytes = 4 * (moreunits - 1);
5213 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5214 /* integer overflow */
5215 PyErr_NoMemory();
5216 goto error;
5217 }
5218 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5219 goto error;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005220 p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005221 }
5222
5223 if (PyBytes_Check(rep)) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005224 Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
5225 p += repsize;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005226 } else /* rep is unicode */ {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005227 const Py_UCS1 *repdata;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005228 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005229 repdata = PyUnicode_1BYTE_DATA(rep);
5230 while (repsize--) {
5231 Py_UCS4 ch = *repdata++;
5232 STORECHAR(ch);
5233 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005234 }
5235
5236 Py_CLEAR(rep);
5237 }
5238
5239 /* Cut back to size actually needed. This is necessary for, for example,
5240 encoding of a string containing isolated surrogates and the 'ignore'
5241 handler is used. */
Serhiy Storchaka30793282014-01-04 22:44:01 +02005242 nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005243 if (nsize != PyBytes_GET_SIZE(v))
5244 _PyBytes_Resize(&v, nsize);
5245 Py_XDECREF(errorHandler);
5246 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005247 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005248 error:
5249 Py_XDECREF(rep);
5250 Py_XDECREF(errorHandler);
5251 Py_XDECREF(exc);
5252 Py_XDECREF(v);
5253 return NULL;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005254#undef STORECHAR
Walter Dörwald41980ca2007-08-16 21:55:45 +00005255}
5256
Alexander Belopolsky40018472011-02-26 01:02:56 +00005257PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005258PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5259 Py_ssize_t size,
5260 const char *errors,
5261 int byteorder)
5262{
5263 PyObject *result;
5264 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5265 if (tmp == NULL)
5266 return NULL;
5267 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5268 Py_DECREF(tmp);
5269 return result;
5270}
5271
5272PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005273PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005274{
Victor Stinnerb960b342011-11-20 19:12:52 +01005275 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005276}
5277
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278/* --- UTF-16 Codec ------------------------------------------------------- */
5279
Tim Peters772747b2001-08-09 22:21:55 +00005280PyObject *
5281PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005282 Py_ssize_t size,
5283 const char *errors,
5284 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285{
Walter Dörwald69652032004-09-07 20:24:22 +00005286 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5287}
5288
5289PyObject *
5290PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005291 Py_ssize_t size,
5292 const char *errors,
5293 int *byteorder,
5294 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005295{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005296 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005297 Py_ssize_t startinpos;
5298 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005299 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005300 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005301 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005302 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005303 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005304 PyObject *errorHandler = NULL;
5305 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005306 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307
Tim Peters772747b2001-08-09 22:21:55 +00005308 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005309 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310
5311 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005312 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005314 /* Check for BOM marks (U+FEFF) in the input and adjust current
5315 byte order setting accordingly. In native mode, the leading BOM
5316 mark is skipped, in all other modes, it is copied to the output
5317 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005318 if (bo == 0 && size >= 2) {
5319 const Py_UCS4 bom = (q[1] << 8) | q[0];
5320 if (bom == 0xFEFF) {
5321 q += 2;
5322 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005324 else if (bom == 0xFFFE) {
5325 q += 2;
5326 bo = 1;
5327 }
5328 if (byteorder)
5329 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005330 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331
Antoine Pitrou63065d72012-05-15 23:48:04 +02005332 if (q == e) {
5333 if (consumed)
5334 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005335 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005336 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005337
Christian Heimes743e0cd2012-10-17 23:52:17 +02005338#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005339 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005340 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005341#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005342 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005343 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005344#endif
Tim Peters772747b2001-08-09 22:21:55 +00005345
Antoine Pitrou63065d72012-05-15 23:48:04 +02005346 /* Note: size will always be longer than the resulting Unicode
5347 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005348 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005349 writer.min_length = (e - q + 1) / 2;
5350 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005351 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005352
Antoine Pitrou63065d72012-05-15 23:48:04 +02005353 while (1) {
5354 Py_UCS4 ch = 0;
5355 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005356 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005357 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005358 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005359 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005360 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005361 native_ordering);
5362 else
5363 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005364 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005365 native_ordering);
5366 } else if (kind == PyUnicode_2BYTE_KIND) {
5367 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005368 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005369 native_ordering);
5370 } else {
5371 assert(kind == PyUnicode_4BYTE_KIND);
5372 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005373 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005374 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005375 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005376 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005377
Antoine Pitrou63065d72012-05-15 23:48:04 +02005378 switch (ch)
5379 {
5380 case 0:
5381 /* remaining byte at the end? (size should be even) */
5382 if (q == e || consumed)
5383 goto End;
5384 errmsg = "truncated data";
5385 startinpos = ((const char *)q) - starts;
5386 endinpos = ((const char *)e) - starts;
5387 break;
5388 /* The remaining input chars are ignored if the callback
5389 chooses to skip the input */
5390 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005391 q -= 2;
5392 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005393 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005394 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005395 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005396 endinpos = ((const char *)e) - starts;
5397 break;
5398 case 2:
5399 errmsg = "illegal encoding";
5400 startinpos = ((const char *)q) - 2 - starts;
5401 endinpos = startinpos + 2;
5402 break;
5403 case 3:
5404 errmsg = "illegal UTF-16 surrogate";
5405 startinpos = ((const char *)q) - 4 - starts;
5406 endinpos = startinpos + 2;
5407 break;
5408 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005409 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005410 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 continue;
5412 }
5413
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005414 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005415 errors,
5416 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005417 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005418 &starts,
5419 (const char **)&e,
5420 &startinpos,
5421 &endinpos,
5422 &exc,
5423 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005424 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005425 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 }
5427
Antoine Pitrou63065d72012-05-15 23:48:04 +02005428End:
Walter Dörwald69652032004-09-07 20:24:22 +00005429 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005430 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005431
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005432 Py_XDECREF(errorHandler);
5433 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005434 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435
Benjamin Peterson29060642009-01-31 22:14:21 +00005436 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005437 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005438 Py_XDECREF(errorHandler);
5439 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 return NULL;
5441}
5442
Tim Peters772747b2001-08-09 22:21:55 +00005443PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005444_PyUnicode_EncodeUTF16(PyObject *str,
5445 const char *errors,
5446 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005448 enum PyUnicode_Kind kind;
5449 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005450 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005451 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005452 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005453 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005454#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005455 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005456#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005457 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005458#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005459 const char *encoding;
5460 Py_ssize_t nsize, pos;
5461 PyObject *errorHandler = NULL;
5462 PyObject *exc = NULL;
5463 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005464
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005465 if (!PyUnicode_Check(str)) {
5466 PyErr_BadArgument();
5467 return NULL;
5468 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005469 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005470 return NULL;
5471 kind = PyUnicode_KIND(str);
5472 data = PyUnicode_DATA(str);
5473 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005474
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005475 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005476 if (kind == PyUnicode_4BYTE_KIND) {
5477 const Py_UCS4 *in = (const Py_UCS4 *)data;
5478 const Py_UCS4 *end = in + len;
5479 while (in < end)
5480 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005481 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005482 }
5483 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005484 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005485 nsize = len + pairs + (byteorder == 0);
5486 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487 if (v == NULL)
5488 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005490 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005491 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005492 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005494 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005495 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005496 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005497
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005498 if (kind == PyUnicode_1BYTE_KIND) {
5499 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5500 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005501 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005502
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005503 if (byteorder < 0)
5504 encoding = "utf-16-le";
5505 else if (byteorder > 0)
5506 encoding = "utf-16-be";
5507 else
5508 encoding = "utf-16";
5509
5510 pos = 0;
5511 while (pos < len) {
5512 Py_ssize_t repsize, moreunits;
5513
5514 if (kind == PyUnicode_2BYTE_KIND) {
5515 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5516 &out, native_ordering);
5517 }
5518 else {
5519 assert(kind == PyUnicode_4BYTE_KIND);
5520 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5521 &out, native_ordering);
5522 }
5523 if (pos == len)
5524 break;
5525
5526 rep = unicode_encode_call_errorhandler(
5527 errors, &errorHandler,
5528 encoding, "surrogates not allowed",
5529 str, &exc, pos, pos + 1, &pos);
5530 if (!rep)
5531 goto error;
5532
5533 if (PyBytes_Check(rep)) {
5534 repsize = PyBytes_GET_SIZE(rep);
5535 if (repsize & 1) {
5536 raise_encode_exception(&exc, encoding,
5537 str, pos - 1, pos,
5538 "surrogates not allowed");
5539 goto error;
5540 }
5541 moreunits = repsize / 2;
5542 }
5543 else {
5544 assert(PyUnicode_Check(rep));
5545 if (PyUnicode_READY(rep) < 0)
5546 goto error;
5547 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5548 if (!PyUnicode_IS_ASCII(rep)) {
5549 raise_encode_exception(&exc, encoding,
5550 str, pos - 1, pos,
5551 "surrogates not allowed");
5552 goto error;
5553 }
5554 }
5555
5556 /* two bytes are reserved for each surrogate */
5557 if (moreunits > 1) {
5558 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5559 Py_ssize_t morebytes = 2 * (moreunits - 1);
5560 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5561 /* integer overflow */
5562 PyErr_NoMemory();
5563 goto error;
5564 }
5565 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5566 goto error;
5567 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5568 }
5569
5570 if (PyBytes_Check(rep)) {
5571 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5572 out += moreunits;
5573 } else /* rep is unicode */ {
5574 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5575 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5576 &out, native_ordering);
5577 }
5578
5579 Py_CLEAR(rep);
5580 }
5581
5582 /* Cut back to size actually needed. This is necessary for, for example,
5583 encoding of a string containing isolated surrogates and the 'ignore' handler
5584 is used. */
5585 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5586 if (nsize != PyBytes_GET_SIZE(v))
5587 _PyBytes_Resize(&v, nsize);
5588 Py_XDECREF(errorHandler);
5589 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005590 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005591 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005592 error:
5593 Py_XDECREF(rep);
5594 Py_XDECREF(errorHandler);
5595 Py_XDECREF(exc);
5596 Py_XDECREF(v);
5597 return NULL;
5598#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599}
5600
Alexander Belopolsky40018472011-02-26 01:02:56 +00005601PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005602PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5603 Py_ssize_t size,
5604 const char *errors,
5605 int byteorder)
5606{
5607 PyObject *result;
5608 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5609 if (tmp == NULL)
5610 return NULL;
5611 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5612 Py_DECREF(tmp);
5613 return result;
5614}
5615
5616PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005617PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005619 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620}
5621
5622/* --- Unicode Escape Codec ----------------------------------------------- */
5623
/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
   if all the escapes in the string make it still a valid ASCII string.

   Returns the number of characters the decoded string will contain when
   the input consists only of ASCII bytes and of backslash escapes whose
   result is known to be ASCII.  Returns -1 when `size` is negative, when a
   non-ASCII byte is present, when a backslash is the last byte, or when an
   escape is found that may produce a non-ASCII character (octal, \x, \u,
   \U, \N).
   */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before forming `end`: pointer arithmetic with a
       negative offset would leave the buffer. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
            }
        }
    }
    return length;
}
5678
Fredrik Lundh06d12682001-01-24 07:59:11 +00005679static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005680
Alexander Belopolsky40018472011-02-26 01:02:56 +00005681PyObject *
5682PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005683 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005684 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005687 Py_ssize_t startinpos;
5688 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005689 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005691 char* message;
5692 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005693 PyObject *errorHandler = NULL;
5694 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005695 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005696
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005697 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005698 if (len == 0)
5699 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005700
5701 /* After length_of_escaped_ascii_string() there are two alternatives,
5702 either the string is pure ASCII with named escapes like \n, etc.
5703 and we determined it's exact size (common case)
5704 or it contains \x, \u, ... escape sequences. then we create a
5705 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005706 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005707 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005708 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005709 }
5710 else {
5711 /* Escaped strings will always be longer than the resulting
5712 Unicode string, so we start with size here and then reduce the
5713 length after conversion to the true value.
5714 (but if the error callback returns a long replacement string
5715 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005716 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005717 }
5718
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005720 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005722
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723 while (s < end) {
5724 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005725 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005726 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727
5728 /* Non-escape characters are interpreted as Unicode ordinals */
5729 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005730 x = (unsigned char)*s;
5731 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005732 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005733 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 continue;
5735 }
5736
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005737 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738 /* \ - Escapes */
5739 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005740 c = *s++;
5741 if (s > end)
5742 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005743
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005744 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005747#define WRITECHAR(ch) \
5748 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005749 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005750 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005751 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005752
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005754 case '\\': WRITECHAR('\\'); break;
5755 case '\'': WRITECHAR('\''); break;
5756 case '\"': WRITECHAR('\"'); break;
5757 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005758 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005759 case 'f': WRITECHAR('\014'); break;
5760 case 't': WRITECHAR('\t'); break;
5761 case 'n': WRITECHAR('\n'); break;
5762 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005763 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005764 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005765 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005766 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 case '0': case '1': case '2': case '3':
5770 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005771 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005772 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005773 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005774 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005775 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005777 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 break;
5779
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 /* hex escapes */
5781 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005783 digits = 2;
5784 message = "truncated \\xXX escape";
5785 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005789 digits = 4;
5790 message = "truncated \\uXXXX escape";
5791 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005794 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005795 digits = 8;
5796 message = "truncated \\UXXXXXXXX escape";
5797 hexescape:
5798 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005799 if (end - s < digits) {
5800 /* count only hex digits */
5801 for (; s < end; ++s) {
5802 c = (unsigned char)*s;
5803 if (!Py_ISXDIGIT(c))
5804 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005805 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005806 goto error;
5807 }
5808 for (; digits--; ++s) {
5809 c = (unsigned char)*s;
5810 if (!Py_ISXDIGIT(c))
5811 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005812 chr = (chr<<4) & ~0xF;
5813 if (c >= '0' && c <= '9')
5814 chr += c - '0';
5815 else if (c >= 'a' && c <= 'f')
5816 chr += 10 + c - 'a';
5817 else
5818 chr += 10 + c - 'A';
5819 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005820 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005821 /* _decoding_error will have already written into the
5822 target buffer. */
5823 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005824 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005825 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005826 message = "illegal Unicode character";
5827 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005828 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005829 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005830 break;
5831
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005833 case 'N':
5834 message = "malformed \\N character escape";
5835 if (ucnhash_CAPI == NULL) {
5836 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005837 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5838 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005839 if (ucnhash_CAPI == NULL)
5840 goto ucnhashError;
5841 }
5842 if (*s == '{') {
5843 const char *start = s+1;
5844 /* look for the closing brace */
5845 while (*s != '}' && s < end)
5846 s++;
5847 if (s > start && s < end && *s == '}') {
5848 /* found a name. look it up in the unicode database */
5849 message = "unknown Unicode character name";
5850 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005851 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005852 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005853 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005854 goto store;
5855 }
5856 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005857 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005858
5859 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005860 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005861 message = "\\ at end of string";
5862 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005863 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005864 }
5865 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005866 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005867 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005868 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005869 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005871 continue;
5872
5873 error:
5874 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005875 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005876 errors, &errorHandler,
5877 "unicodeescape", message,
5878 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005879 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005880 goto onError;
5881 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005883#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005884
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005885 Py_XDECREF(errorHandler);
5886 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005887 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005888
Benjamin Peterson29060642009-01-31 22:14:21 +00005889 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005890 PyErr_SetString(
5891 PyExc_UnicodeError,
5892 "\\N escapes not supported (can't load unicodedata module)"
5893 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005894 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005895 Py_XDECREF(errorHandler);
5896 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005897 return NULL;
5898
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005900 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005901 Py_XDECREF(errorHandler);
5902 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 return NULL;
5904}
5905
5906/* Return a Unicode-Escape string version of the Unicode object.
5907
5908 If quotes is true, the string is enclosed in u"" or u'' quotes as
5909 appropriate.
5910
5911*/
5912
Alexander Belopolsky40018472011-02-26 01:02:56 +00005913PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005914PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005916 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005917 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005919 int kind;
5920 void *data;
5921 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922
Ezio Melottie7f90372012-10-05 03:33:31 +03005923 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005924 escape.
5925
Ezio Melottie7f90372012-10-05 03:33:31 +03005926 For UCS1 strings it's '\xxx', 4 bytes per source character.
5927 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5928 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005929 */
5930
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005931 if (!PyUnicode_Check(unicode)) {
5932 PyErr_BadArgument();
5933 return NULL;
5934 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005935 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005936 return NULL;
5937 len = PyUnicode_GET_LENGTH(unicode);
5938 kind = PyUnicode_KIND(unicode);
5939 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005940 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005941 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5942 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5943 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5944 }
5945
5946 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005947 return PyBytes_FromStringAndSize(NULL, 0);
5948
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005949 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005950 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005951
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005952 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005953 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005954 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 if (repr == NULL)
5957 return NULL;
5958
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005959 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005961 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005962 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005963
Walter Dörwald79e913e2007-05-12 11:08:06 +00005964 /* Escape backslashes */
5965 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 *p++ = '\\';
5967 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005968 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005969 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005970
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005971 /* Map 21-bit characters to '\U00xxxxxx' */
5972 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005973 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005974 *p++ = '\\';
5975 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005976 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5977 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5978 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5979 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5980 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5981 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5982 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5983 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005985 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005986
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005988 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 *p++ = '\\';
5990 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005991 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5992 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5993 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5994 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005996
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005997 /* Map special whitespace to '\t', \n', '\r' */
5998 else if (ch == '\t') {
5999 *p++ = '\\';
6000 *p++ = 't';
6001 }
6002 else if (ch == '\n') {
6003 *p++ = '\\';
6004 *p++ = 'n';
6005 }
6006 else if (ch == '\r') {
6007 *p++ = '\\';
6008 *p++ = 'r';
6009 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006010
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006011 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006012 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006014 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006015 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6016 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006017 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006018
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019 /* Copy everything else as-is */
6020 else
6021 *p++ = (char) ch;
6022 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006024 assert(p - PyBytes_AS_STRING(repr) > 0);
6025 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6026 return NULL;
6027 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028}
6029
Alexander Belopolsky40018472011-02-26 01:02:56 +00006030PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006031PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6032 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006034 PyObject *result;
6035 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6036 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006038 result = PyUnicode_AsUnicodeEscapeString(tmp);
6039 Py_DECREF(tmp);
6040 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041}
6042
6043/* --- Raw Unicode Escape Codec ------------------------------------------- */
6044
Alexander Belopolsky40018472011-02-26 01:02:56 +00006045PyObject *
6046PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006047 Py_ssize_t size,
6048 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006050 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006051 Py_ssize_t startinpos;
6052 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006053 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 const char *end;
6055 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006056 PyObject *errorHandler = NULL;
6057 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006058
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006059 if (size == 0)
6060 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006061
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 /* Escaped strings will always be longer than the resulting
6063 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006064 length after conversion to the true value. (But decoding error
6065 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006066 _PyUnicodeWriter_Init(&writer);
6067 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006068
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 end = s + size;
6070 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 unsigned char c;
6072 Py_UCS4 x;
6073 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006074 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 /* Non-escape characters are interpreted as Unicode ordinals */
6077 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006078 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006079 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006080 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006082 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006083 startinpos = s-starts;
6084
6085 /* \u-escapes are only interpreted iff the number of leading
6086 backslashes if odd */
6087 bs = s;
6088 for (;s < end;) {
6089 if (*s != '\\')
6090 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006091 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006092 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006093 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 }
6095 if (((s - bs) & 1) == 0 ||
6096 s >= end ||
6097 (*s != 'u' && *s != 'U')) {
6098 continue;
6099 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006100 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 count = *s=='u' ? 4 : 8;
6102 s++;
6103
6104 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006105 for (x = 0, i = 0; i < count; ++i, ++s) {
6106 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006107 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006108 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006109 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 errors, &errorHandler,
6111 "rawunicodeescape", "truncated \\uXXXX",
6112 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006113 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006114 goto onError;
6115 goto nextByte;
6116 }
6117 x = (x<<4) & ~0xF;
6118 if (c >= '0' && c <= '9')
6119 x += c - '0';
6120 else if (c >= 'a' && c <= 'f')
6121 x += 10 + c - 'a';
6122 else
6123 x += 10 + c - 'A';
6124 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006125 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006126 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006127 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006128 }
6129 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006130 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006131 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006132 errors, &errorHandler,
6133 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006135 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006137 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006138 nextByte:
6139 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006141 Py_XDECREF(errorHandler);
6142 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006143 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006144
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006146 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006147 Py_XDECREF(errorHandler);
6148 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 return NULL;
6150}
6151
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006152
Alexander Belopolsky40018472011-02-26 01:02:56 +00006153PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006154PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006156 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 char *p;
6158 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006159 Py_ssize_t expandsize, pos;
6160 int kind;
6161 void *data;
6162 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006164 if (!PyUnicode_Check(unicode)) {
6165 PyErr_BadArgument();
6166 return NULL;
6167 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006168 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006169 return NULL;
6170 kind = PyUnicode_KIND(unicode);
6171 data = PyUnicode_DATA(unicode);
6172 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006173 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6174 bytes, and 1 byte characters 4. */
6175 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006176
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006177 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006179
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006180 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181 if (repr == NULL)
6182 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006183 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006184 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006186 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006187 for (pos = 0; pos < len; pos++) {
6188 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 /* Map 32-bit characters to '\Uxxxxxxxx' */
6190 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006191 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006192 *p++ = '\\';
6193 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006194 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6195 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6196 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6197 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6198 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6199 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6200 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6201 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006202 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006204 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 *p++ = '\\';
6206 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006207 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6208 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6209 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6210 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006212 /* Copy everything else as-is */
6213 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 *p++ = (char) ch;
6215 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006216
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006217 assert(p > q);
6218 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006219 return NULL;
6220 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221}
6222
Alexander Belopolsky40018472011-02-26 01:02:56 +00006223PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006224PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6225 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006227 PyObject *result;
6228 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6229 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006230 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006231 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6232 Py_DECREF(tmp);
6233 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234}
6235
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006236/* --- Unicode Internal Codec ------------------------------------------- */
6237
Alexander Belopolsky40018472011-02-26 01:02:56 +00006238PyObject *
6239_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006240 Py_ssize_t size,
6241 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006242{
6243 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006244 Py_ssize_t startinpos;
6245 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006246 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006247 const char *end;
6248 const char *reason;
6249 PyObject *errorHandler = NULL;
6250 PyObject *exc = NULL;
6251
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006252 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006253 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006254 1))
6255 return NULL;
6256
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006257 if (size == 0)
6258 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006259
Victor Stinner8f674cc2013-04-17 23:02:17 +02006260 _PyUnicodeWriter_Init(&writer);
6261 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6262 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006264 }
6265 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006266
Victor Stinner8f674cc2013-04-17 23:02:17 +02006267 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006268 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006269 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006270 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006271 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006272 endinpos = end-starts;
6273 reason = "truncated input";
6274 goto error;
6275 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006276 /* We copy the raw representation one byte at a time because the
6277 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006278 ((char *) &uch)[0] = s[0];
6279 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006280#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006281 ((char *) &uch)[2] = s[2];
6282 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006283#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006284 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006285#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006286 /* We have to sanity check the raw data, otherwise doom looms for
6287 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006288 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006289 endinpos = s - starts + Py_UNICODE_SIZE;
6290 reason = "illegal code point (> 0x10FFFF)";
6291 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006292 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006293#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006294 s += Py_UNICODE_SIZE;
6295#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006296 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006297 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006298 Py_UNICODE uch2;
6299 ((char *) &uch2)[0] = s[0];
6300 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006301 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006302 {
Victor Stinner551ac952011-11-29 22:58:13 +01006303 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006304 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006305 }
6306 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006307#endif
6308
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006309 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006310 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006311 continue;
6312
6313 error:
6314 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006315 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006316 errors, &errorHandler,
6317 "unicode_internal", reason,
6318 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006319 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006320 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006321 }
6322
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006323 Py_XDECREF(errorHandler);
6324 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006325 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006326
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006328 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006329 Py_XDECREF(errorHandler);
6330 Py_XDECREF(exc);
6331 return NULL;
6332}
6333
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334/* --- Latin-1 Codec ------------------------------------------------------ */
6335
Alexander Belopolsky40018472011-02-26 01:02:56 +00006336PyObject *
6337PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006338 Py_ssize_t size,
6339 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006342 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343}
6344
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006345/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006346static void
6347make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006348 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006349 PyObject *unicode,
6350 Py_ssize_t startpos, Py_ssize_t endpos,
6351 const char *reason)
6352{
6353 if (*exceptionObject == NULL) {
6354 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006355 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006356 encoding, unicode, startpos, endpos, reason);
6357 }
6358 else {
6359 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6360 goto onError;
6361 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6362 goto onError;
6363 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6364 goto onError;
6365 return;
6366 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006367 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006368 }
6369}
6370
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006371/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006372static void
6373raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006374 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006375 PyObject *unicode,
6376 Py_ssize_t startpos, Py_ssize_t endpos,
6377 const char *reason)
6378{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006379 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006380 encoding, unicode, startpos, endpos, reason);
6381 if (*exceptionObject != NULL)
6382 PyCodec_StrictErrors(*exceptionObject);
6383}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006384
6385/* error handling callback helper:
6386 build arguments, call the callback and check the arguments,
6387 put the result into newpos and return the replacement string, which
6388 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006389static PyObject *
6390unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006391 PyObject **errorHandler,
6392 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006393 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006394 Py_ssize_t startpos, Py_ssize_t endpos,
6395 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006396{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006397 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006398 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 PyObject *restuple;
6400 PyObject *resunicode;
6401
6402 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006403 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006404 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006405 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006406 }
6407
Benjamin Petersonbac79492012-01-14 13:34:47 -05006408 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006409 return NULL;
6410 len = PyUnicode_GET_LENGTH(unicode);
6411
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006412 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006413 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006416
6417 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006422 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006423 Py_DECREF(restuple);
6424 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006425 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006426 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 &resunicode, newpos)) {
6428 Py_DECREF(restuple);
6429 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006431 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6432 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6433 Py_DECREF(restuple);
6434 return NULL;
6435 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006437 *newpos = len + *newpos;
6438 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6440 Py_DECREF(restuple);
6441 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006442 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006443 Py_INCREF(resunicode);
6444 Py_DECREF(restuple);
6445 return resunicode;
6446}
6447
Alexander Belopolsky40018472011-02-26 01:02:56 +00006448static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006449unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006450 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006451 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006452{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006453 /* input state */
6454 Py_ssize_t pos=0, size;
6455 int kind;
6456 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006457 /* output object */
6458 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006459 /* pointer into the output */
6460 char *str;
6461 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006462 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006463 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6464 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006465 PyObject *errorHandler = NULL;
6466 PyObject *exc = NULL;
6467 /* the following variable is used for caching string comparisons
6468 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6469 int known_errorHandler = -1;
6470
Benjamin Petersonbac79492012-01-14 13:34:47 -05006471 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006472 return NULL;
6473 size = PyUnicode_GET_LENGTH(unicode);
6474 kind = PyUnicode_KIND(unicode);
6475 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006476 /* allocate enough for a simple encoding without
6477 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006478 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006479 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006480 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006481 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006482 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006483 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006484 ressize = size;
6485
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006486 while (pos < size) {
6487 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006488
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 /* can we encode this? */
6490 if (c<limit) {
6491 /* no overflow check, because we know that the space is enough */
6492 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006493 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006494 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 Py_ssize_t requiredsize;
6497 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006498 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006500 Py_ssize_t collstart = pos;
6501 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006503 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 ++collend;
6505 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6506 if (known_errorHandler==-1) {
6507 if ((errors==NULL) || (!strcmp(errors, "strict")))
6508 known_errorHandler = 1;
6509 else if (!strcmp(errors, "replace"))
6510 known_errorHandler = 2;
6511 else if (!strcmp(errors, "ignore"))
6512 known_errorHandler = 3;
6513 else if (!strcmp(errors, "xmlcharrefreplace"))
6514 known_errorHandler = 4;
6515 else
6516 known_errorHandler = 0;
6517 }
6518 switch (known_errorHandler) {
6519 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006520 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 goto onError;
6522 case 2: /* replace */
6523 while (collstart++<collend)
6524 *str++ = '?'; /* fall through */
6525 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006526 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 break;
6528 case 4: /* xmlcharrefreplace */
6529 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006530 /* determine replacement size */
6531 for (i = collstart, repsize = 0; i < collend; ++i) {
6532 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6533 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006535 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006537 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006539 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006541 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006543 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006545 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006546 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006548 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006550 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006551 if (requiredsize > ressize) {
6552 if (requiredsize<2*ressize)
6553 requiredsize = 2*ressize;
6554 if (_PyBytes_Resize(&res, requiredsize))
6555 goto onError;
6556 str = PyBytes_AS_STRING(res) + respos;
6557 ressize = requiredsize;
6558 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006559 /* generate replacement */
6560 for (i = collstart; i < collend; ++i) {
6561 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006562 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006563 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 break;
6565 default:
6566 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006567 encoding, reason, unicode, &exc,
6568 collstart, collend, &newpos);
6569 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006570 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006571 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006572 if (PyBytes_Check(repunicode)) {
6573 /* Directly copy bytes result to output. */
6574 repsize = PyBytes_Size(repunicode);
6575 if (repsize > 1) {
6576 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006577 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006578 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6579 Py_DECREF(repunicode);
6580 goto onError;
6581 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006582 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006583 ressize += repsize-1;
6584 }
6585 memcpy(str, PyBytes_AsString(repunicode), repsize);
6586 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006587 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006588 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006589 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006590 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 /* need more space? (at least enough for what we
6592 have+the replacement+the rest of the string, so
6593 we won't have to check space for encodable characters) */
6594 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006595 repsize = PyUnicode_GET_LENGTH(repunicode);
6596 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 if (requiredsize > ressize) {
6598 if (requiredsize<2*ressize)
6599 requiredsize = 2*ressize;
6600 if (_PyBytes_Resize(&res, requiredsize)) {
6601 Py_DECREF(repunicode);
6602 goto onError;
6603 }
6604 str = PyBytes_AS_STRING(res) + respos;
6605 ressize = requiredsize;
6606 }
6607 /* check if there is anything unencodable in the replacement
6608 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006609 for (i = 0; repsize-->0; ++i, ++str) {
6610 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006612 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006613 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 Py_DECREF(repunicode);
6615 goto onError;
6616 }
6617 *str = (char)c;
6618 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006619 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006620 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006621 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006622 }
6623 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006624 /* Resize if we allocated to much */
6625 size = str - PyBytes_AS_STRING(res);
6626 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006627 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006628 if (_PyBytes_Resize(&res, size) < 0)
6629 goto onError;
6630 }
6631
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006632 Py_XDECREF(errorHandler);
6633 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006634 return res;
6635
6636 onError:
6637 Py_XDECREF(res);
6638 Py_XDECREF(errorHandler);
6639 Py_XDECREF(exc);
6640 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006641}
6642
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006643/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006644PyObject *
6645PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006646 Py_ssize_t size,
6647 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006649 PyObject *result;
6650 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6651 if (unicode == NULL)
6652 return NULL;
6653 result = unicode_encode_ucs1(unicode, errors, 256);
6654 Py_DECREF(unicode);
6655 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656}
6657
Alexander Belopolsky40018472011-02-26 01:02:56 +00006658PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006659_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660{
6661 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006662 PyErr_BadArgument();
6663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006665 if (PyUnicode_READY(unicode) == -1)
6666 return NULL;
6667 /* Fast path: if it is a one-byte string, construct
6668 bytes object directly. */
6669 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6670 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6671 PyUnicode_GET_LENGTH(unicode));
6672 /* Non-Latin-1 characters present. Defer to above function to
6673 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006674 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006675}
6676
6677PyObject*
6678PyUnicode_AsLatin1String(PyObject *unicode)
6679{
6680 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681}
6682
6683/* --- 7-bit ASCII Codec -------------------------------------------------- */
6684
Alexander Belopolsky40018472011-02-26 01:02:56 +00006685PyObject *
6686PyUnicode_DecodeASCII(const char *s,
6687 Py_ssize_t size,
6688 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006690 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006691 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006692 int kind;
6693 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006694 Py_ssize_t startinpos;
6695 Py_ssize_t endinpos;
6696 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006697 const char *e;
6698 PyObject *errorHandler = NULL;
6699 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006700
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006702 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006703
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006705 if (size == 1 && (unsigned char)s[0] < 128)
6706 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006707
Victor Stinner8f674cc2013-04-17 23:02:17 +02006708 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006709 writer.min_length = size;
6710 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006711 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006712
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006713 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006714 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006715 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006716 writer.pos = outpos;
6717 if (writer.pos == size)
6718 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006719
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006720 s += writer.pos;
6721 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006722 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006723 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006724 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006725 PyUnicode_WRITE(kind, data, writer.pos, c);
6726 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 ++s;
6728 }
6729 else {
6730 startinpos = s-starts;
6731 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006732 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 errors, &errorHandler,
6734 "ascii", "ordinal not in range(128)",
6735 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006736 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006738 kind = writer.kind;
6739 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006742 Py_XDECREF(errorHandler);
6743 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006744 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006745
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006747 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006748 Py_XDECREF(errorHandler);
6749 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 return NULL;
6751}
6752
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006753/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006754PyObject *
6755PyUnicode_EncodeASCII(const Py_UNICODE *p,
6756 Py_ssize_t size,
6757 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006759 PyObject *result;
6760 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6761 if (unicode == NULL)
6762 return NULL;
6763 result = unicode_encode_ucs1(unicode, errors, 128);
6764 Py_DECREF(unicode);
6765 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766}
6767
Alexander Belopolsky40018472011-02-26 01:02:56 +00006768PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006769_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770{
6771 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 PyErr_BadArgument();
6773 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006775 if (PyUnicode_READY(unicode) == -1)
6776 return NULL;
6777 /* Fast path: if it is an ASCII-only string, construct bytes object
6778 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006779 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006780 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6781 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006782 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006783}
6784
6785PyObject *
6786PyUnicode_AsASCIIString(PyObject *unicode)
6787{
6788 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789}
6790
Victor Stinner99b95382011-07-04 14:23:54 +02006791#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006792
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006793/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006794
/* NEED_RETRY is defined when size_t is wider than int. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif

/* Fallback definition for headers that do not provide
   WC_ERR_INVALID_CHARS. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6802
6803static char*
6804code_page_name(UINT code_page, PyObject **obj)
6805{
6806 *obj = NULL;
6807 if (code_page == CP_ACP)
6808 return "mbcs";
6809 if (code_page == CP_UTF7)
6810 return "CP_UTF7";
6811 if (code_page == CP_UTF8)
6812 return "CP_UTF8";
6813
6814 *obj = PyBytes_FromFormat("cp%u", code_page);
6815 if (*obj == NULL)
6816 return NULL;
6817 return PyBytes_AS_STRING(*obj);
6818}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006819
Alexander Belopolsky40018472011-02-26 01:02:56 +00006820static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006821is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006822{
6823 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006824 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006825
Victor Stinner3a50e702011-10-18 21:21:00 +02006826 if (!IsDBCSLeadByteEx(code_page, *curr))
6827 return 0;
6828
6829 prev = CharPrevExA(code_page, s, curr, 0);
6830 if (prev == curr)
6831 return 1;
6832 /* FIXME: This code is limited to "true" double-byte encodings,
6833 as it assumes an incomplete character consists of a single
6834 byte. */
6835 if (curr - prev == 2)
6836 return 1;
6837 if (!IsDBCSLeadByteEx(code_page, *prev))
6838 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006839 return 0;
6840}
6841
Victor Stinner3a50e702011-10-18 21:21:00 +02006842static DWORD
6843decode_code_page_flags(UINT code_page)
6844{
6845 if (code_page == CP_UTF7) {
6846 /* The CP_UTF7 decoder only supports flags=0 */
6847 return 0;
6848 }
6849 else
6850 return MB_ERR_INVALID_CHARS;
6851}
6852
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006853/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006854 * Decode a byte string from a Windows code page into unicode object in strict
6855 * mode.
6856 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006857 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6858 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006859 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006860static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006861decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006862 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006863 const char *in,
6864 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006865{
Victor Stinner3a50e702011-10-18 21:21:00 +02006866 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006867 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006868 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006869
6870 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006871 assert(insize > 0);
6872 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6873 if (outsize <= 0)
6874 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006875
6876 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006878 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006879 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006880 if (*v == NULL)
6881 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006882 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006883 }
6884 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006885 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006886 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006887 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006888 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006889 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006890 }
6891
6892 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006893 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6894 if (outsize <= 0)
6895 goto error;
6896 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006897
Victor Stinner3a50e702011-10-18 21:21:00 +02006898error:
6899 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6900 return -2;
6901 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006902 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006903}
6904
Victor Stinner3a50e702011-10-18 21:21:00 +02006905/*
6906 * Decode a byte string from a code page into unicode object with an error
6907 * handler.
6908 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006909 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006910 * UnicodeDecodeError exception and returns -1 on error.
6911 */
6912static int
6913decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006914 PyObject **v,
6915 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006916 const char *errors)
6917{
6918 const char *startin = in;
6919 const char *endin = in + size;
6920 const DWORD flags = decode_code_page_flags(code_page);
6921 /* Ideally, we should get reason from FormatMessage. This is the Windows
6922 2000 English version of the message. */
6923 const char *reason = "No mapping for the Unicode character exists "
6924 "in the target code page.";
6925 /* each step cannot decode more than 1 character, but a character can be
6926 represented as a surrogate pair */
6927 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006928 int insize;
6929 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006930 PyObject *errorHandler = NULL;
6931 PyObject *exc = NULL;
6932 PyObject *encoding_obj = NULL;
6933 char *encoding;
6934 DWORD err;
6935 int ret = -1;
6936
6937 assert(size > 0);
6938
6939 encoding = code_page_name(code_page, &encoding_obj);
6940 if (encoding == NULL)
6941 return -1;
6942
6943 if (errors == NULL || strcmp(errors, "strict") == 0) {
6944 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6945 UnicodeDecodeError. */
6946 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6947 if (exc != NULL) {
6948 PyCodec_StrictErrors(exc);
6949 Py_CLEAR(exc);
6950 }
6951 goto error;
6952 }
6953
6954 if (*v == NULL) {
6955 /* Create unicode object */
6956 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6957 PyErr_NoMemory();
6958 goto error;
6959 }
Victor Stinnerab595942011-12-17 04:59:06 +01006960 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006961 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006962 if (*v == NULL)
6963 goto error;
6964 startout = PyUnicode_AS_UNICODE(*v);
6965 }
6966 else {
6967 /* Extend unicode object */
6968 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6969 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6970 PyErr_NoMemory();
6971 goto error;
6972 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006973 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006974 goto error;
6975 startout = PyUnicode_AS_UNICODE(*v) + n;
6976 }
6977
6978 /* Decode the byte string character per character */
6979 out = startout;
6980 while (in < endin)
6981 {
6982 /* Decode a character */
6983 insize = 1;
6984 do
6985 {
6986 outsize = MultiByteToWideChar(code_page, flags,
6987 in, insize,
6988 buffer, Py_ARRAY_LENGTH(buffer));
6989 if (outsize > 0)
6990 break;
6991 err = GetLastError();
6992 if (err != ERROR_NO_UNICODE_TRANSLATION
6993 && err != ERROR_INSUFFICIENT_BUFFER)
6994 {
6995 PyErr_SetFromWindowsErr(0);
6996 goto error;
6997 }
6998 insize++;
6999 }
7000 /* 4=maximum length of a UTF-8 sequence */
7001 while (insize <= 4 && (in + insize) <= endin);
7002
7003 if (outsize <= 0) {
7004 Py_ssize_t startinpos, endinpos, outpos;
7005
7006 startinpos = in - startin;
7007 endinpos = startinpos + 1;
7008 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007009 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007010 errors, &errorHandler,
7011 encoding, reason,
7012 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007013 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007014 {
7015 goto error;
7016 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007017 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007018 }
7019 else {
7020 in += insize;
7021 memcpy(out, buffer, outsize * sizeof(wchar_t));
7022 out += outsize;
7023 }
7024 }
7025
7026 /* write a NUL character at the end */
7027 *out = 0;
7028
7029 /* Extend unicode object */
7030 outsize = out - startout;
7031 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007032 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007033 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007034 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007035
7036error:
7037 Py_XDECREF(encoding_obj);
7038 Py_XDECREF(errorHandler);
7039 Py_XDECREF(exc);
7040 return ret;
7041}
7042
Victor Stinner3a50e702011-10-18 21:21:00 +02007043static PyObject *
7044decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007045 const char *s, Py_ssize_t size,
7046 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007047{
Victor Stinner76a31a62011-11-04 00:05:13 +01007048 PyObject *v = NULL;
7049 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007050
Victor Stinner3a50e702011-10-18 21:21:00 +02007051 if (code_page < 0) {
7052 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7053 return NULL;
7054 }
7055
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007056 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007057 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007058
Victor Stinner76a31a62011-11-04 00:05:13 +01007059 do
7060 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007061#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007062 if (size > INT_MAX) {
7063 chunk_size = INT_MAX;
7064 final = 0;
7065 done = 0;
7066 }
7067 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007068#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007069 {
7070 chunk_size = (int)size;
7071 final = (consumed == NULL);
7072 done = 1;
7073 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007074
Victor Stinner76a31a62011-11-04 00:05:13 +01007075 /* Skip trailing lead-byte unless 'final' is set */
7076 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7077 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007078
Victor Stinner76a31a62011-11-04 00:05:13 +01007079 if (chunk_size == 0 && done) {
7080 if (v != NULL)
7081 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007082 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007083 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007084
Victor Stinner76a31a62011-11-04 00:05:13 +01007085
7086 converted = decode_code_page_strict(code_page, &v,
7087 s, chunk_size);
7088 if (converted == -2)
7089 converted = decode_code_page_errors(code_page, &v,
7090 s, chunk_size,
7091 errors);
7092 assert(converted != 0);
7093
7094 if (converted < 0) {
7095 Py_XDECREF(v);
7096 return NULL;
7097 }
7098
7099 if (consumed)
7100 *consumed += converted;
7101
7102 s += converted;
7103 size -= converted;
7104 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007105
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007106 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007107}
7108
Alexander Belopolsky40018472011-02-26 01:02:56 +00007109PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007110PyUnicode_DecodeCodePageStateful(int code_page,
7111 const char *s,
7112 Py_ssize_t size,
7113 const char *errors,
7114 Py_ssize_t *consumed)
7115{
7116 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7117}
7118
7119PyObject *
7120PyUnicode_DecodeMBCSStateful(const char *s,
7121 Py_ssize_t size,
7122 const char *errors,
7123 Py_ssize_t *consumed)
7124{
7125 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7126}
7127
7128PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007129PyUnicode_DecodeMBCS(const char *s,
7130 Py_ssize_t size,
7131 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007132{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007133 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7134}
7135
Victor Stinner3a50e702011-10-18 21:21:00 +02007136static DWORD
7137encode_code_page_flags(UINT code_page, const char *errors)
7138{
7139 if (code_page == CP_UTF8) {
7140 if (winver.dwMajorVersion >= 6)
7141 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7142 and later */
7143 return WC_ERR_INVALID_CHARS;
7144 else
7145 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7146 return 0;
7147 }
7148 else if (code_page == CP_UTF7) {
7149 /* CP_UTF7 only supports flags=0 */
7150 return 0;
7151 }
7152 else {
7153 if (errors != NULL && strcmp(errors, "replace") == 0)
7154 return 0;
7155 else
7156 return WC_NO_BEST_FIT_CHARS;
7157 }
7158}
7159
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007160/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007161 * Encode a Unicode string to a Windows code page into a byte string in strict
7162 * mode.
7163 *
7164 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007165 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007166 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007167static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007168encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007169 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007170 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007171{
Victor Stinner554f3f02010-06-16 23:33:54 +00007172 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007173 BOOL *pusedDefaultChar = &usedDefaultChar;
7174 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007175 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007176 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007177 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007178 const DWORD flags = encode_code_page_flags(code_page, NULL);
7179 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007180 /* Create a substring so that we can get the UTF-16 representation
7181 of just the slice under consideration. */
7182 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007183
Martin v. Löwis3d325192011-11-04 18:23:06 +01007184 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007185
Victor Stinner3a50e702011-10-18 21:21:00 +02007186 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007187 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007188 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007189 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007190
Victor Stinner2fc507f2011-11-04 20:06:39 +01007191 substring = PyUnicode_Substring(unicode, offset, offset+len);
7192 if (substring == NULL)
7193 return -1;
7194 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7195 if (p == NULL) {
7196 Py_DECREF(substring);
7197 return -1;
7198 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007199 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007200
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007201 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007202 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007203 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007204 NULL, 0,
7205 NULL, pusedDefaultChar);
7206 if (outsize <= 0)
7207 goto error;
7208 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007209 if (pusedDefaultChar && *pusedDefaultChar) {
7210 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007211 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007212 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007213
Victor Stinner3a50e702011-10-18 21:21:00 +02007214 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007215 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007217 if (*outbytes == NULL) {
7218 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007219 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007220 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007221 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007222 }
7223 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007224 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007225 const Py_ssize_t n = PyBytes_Size(*outbytes);
7226 if (outsize > PY_SSIZE_T_MAX - n) {
7227 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007228 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007229 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007230 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007231 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7232 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007233 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007234 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007235 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007236 }
7237
7238 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007239 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007240 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007241 out, outsize,
7242 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007243 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007244 if (outsize <= 0)
7245 goto error;
7246 if (pusedDefaultChar && *pusedDefaultChar)
7247 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007248 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007249
Victor Stinner3a50e702011-10-18 21:21:00 +02007250error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007251 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007252 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7253 return -2;
7254 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007255 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007256}
7257
Victor Stinner3a50e702011-10-18 21:21:00 +02007258/*
7259 * Encode a Unicode string to a Windows code page into a byte string using a
7260 * error handler.
7261 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007262 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007263 * -1 on other error.
7264 */
7265static int
7266encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007267 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007268 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007269{
Victor Stinner3a50e702011-10-18 21:21:00 +02007270 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007271 Py_ssize_t pos = unicode_offset;
7272 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007273 /* Ideally, we should get reason from FormatMessage. This is the Windows
7274 2000 English version of the message. */
7275 const char *reason = "invalid character";
7276 /* 4=maximum length of a UTF-8 sequence */
7277 char buffer[4];
7278 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7279 Py_ssize_t outsize;
7280 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007281 PyObject *errorHandler = NULL;
7282 PyObject *exc = NULL;
7283 PyObject *encoding_obj = NULL;
7284 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007285 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007286 PyObject *rep;
7287 int ret = -1;
7288
7289 assert(insize > 0);
7290
7291 encoding = code_page_name(code_page, &encoding_obj);
7292 if (encoding == NULL)
7293 return -1;
7294
7295 if (errors == NULL || strcmp(errors, "strict") == 0) {
7296 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7297 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007298 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007299 if (exc != NULL) {
7300 PyCodec_StrictErrors(exc);
7301 Py_DECREF(exc);
7302 }
7303 Py_XDECREF(encoding_obj);
7304 return -1;
7305 }
7306
7307 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7308 pusedDefaultChar = &usedDefaultChar;
7309 else
7310 pusedDefaultChar = NULL;
7311
7312 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7313 PyErr_NoMemory();
7314 goto error;
7315 }
7316 outsize = insize * Py_ARRAY_LENGTH(buffer);
7317
7318 if (*outbytes == NULL) {
7319 /* Create string object */
7320 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7321 if (*outbytes == NULL)
7322 goto error;
7323 out = PyBytes_AS_STRING(*outbytes);
7324 }
7325 else {
7326 /* Extend string object */
7327 Py_ssize_t n = PyBytes_Size(*outbytes);
7328 if (n > PY_SSIZE_T_MAX - outsize) {
7329 PyErr_NoMemory();
7330 goto error;
7331 }
7332 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7333 goto error;
7334 out = PyBytes_AS_STRING(*outbytes) + n;
7335 }
7336
7337 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007338 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007339 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007340 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7341 wchar_t chars[2];
7342 int charsize;
7343 if (ch < 0x10000) {
7344 chars[0] = (wchar_t)ch;
7345 charsize = 1;
7346 }
7347 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007348 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7349 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007350 charsize = 2;
7351 }
7352
Victor Stinner3a50e702011-10-18 21:21:00 +02007353 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007354 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007355 buffer, Py_ARRAY_LENGTH(buffer),
7356 NULL, pusedDefaultChar);
7357 if (outsize > 0) {
7358 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7359 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007360 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007361 memcpy(out, buffer, outsize);
7362 out += outsize;
7363 continue;
7364 }
7365 }
7366 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7367 PyErr_SetFromWindowsErr(0);
7368 goto error;
7369 }
7370
Victor Stinner3a50e702011-10-18 21:21:00 +02007371 rep = unicode_encode_call_errorhandler(
7372 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007373 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007374 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007375 if (rep == NULL)
7376 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007377 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007378
7379 if (PyBytes_Check(rep)) {
7380 outsize = PyBytes_GET_SIZE(rep);
7381 if (outsize != 1) {
7382 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7383 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7384 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7385 Py_DECREF(rep);
7386 goto error;
7387 }
7388 out = PyBytes_AS_STRING(*outbytes) + offset;
7389 }
7390 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7391 out += outsize;
7392 }
7393 else {
7394 Py_ssize_t i;
7395 enum PyUnicode_Kind kind;
7396 void *data;
7397
Benjamin Petersonbac79492012-01-14 13:34:47 -05007398 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007399 Py_DECREF(rep);
7400 goto error;
7401 }
7402
7403 outsize = PyUnicode_GET_LENGTH(rep);
7404 if (outsize != 1) {
7405 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7406 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7407 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7408 Py_DECREF(rep);
7409 goto error;
7410 }
7411 out = PyBytes_AS_STRING(*outbytes) + offset;
7412 }
7413 kind = PyUnicode_KIND(rep);
7414 data = PyUnicode_DATA(rep);
7415 for (i=0; i < outsize; i++) {
7416 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7417 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007418 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007419 encoding, unicode,
7420 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007421 "unable to encode error handler result to ASCII");
7422 Py_DECREF(rep);
7423 goto error;
7424 }
7425 *out = (unsigned char)ch;
7426 out++;
7427 }
7428 }
7429 Py_DECREF(rep);
7430 }
7431 /* write a NUL byte */
7432 *out = 0;
7433 outsize = out - PyBytes_AS_STRING(*outbytes);
7434 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7435 if (_PyBytes_Resize(outbytes, outsize) < 0)
7436 goto error;
7437 ret = 0;
7438
7439error:
7440 Py_XDECREF(encoding_obj);
7441 Py_XDECREF(errorHandler);
7442 Py_XDECREF(exc);
7443 return ret;
7444}
7445
Victor Stinner3a50e702011-10-18 21:21:00 +02007446static PyObject *
7447encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007448 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007449 const char *errors)
7450{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007451 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007453 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007454 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007455
Benjamin Petersonbac79492012-01-14 13:34:47 -05007456 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007457 return NULL;
7458 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007459
Victor Stinner3a50e702011-10-18 21:21:00 +02007460 if (code_page < 0) {
7461 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7462 return NULL;
7463 }
7464
Martin v. Löwis3d325192011-11-04 18:23:06 +01007465 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007466 return PyBytes_FromStringAndSize(NULL, 0);
7467
Victor Stinner7581cef2011-11-03 22:32:33 +01007468 offset = 0;
7469 do
7470 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007471#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007472 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007473 chunks. */
7474 if (len > INT_MAX/2) {
7475 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007476 done = 0;
7477 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007478 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007479#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007480 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007481 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007482 done = 1;
7483 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007484
Victor Stinner76a31a62011-11-04 00:05:13 +01007485 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007486 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007487 errors);
7488 if (ret == -2)
7489 ret = encode_code_page_errors(code_page, &outbytes,
7490 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007491 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007492 if (ret < 0) {
7493 Py_XDECREF(outbytes);
7494 return NULL;
7495 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007496
Victor Stinner7581cef2011-11-03 22:32:33 +01007497 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007498 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007499 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007500
Victor Stinner3a50e702011-10-18 21:21:00 +02007501 return outbytes;
7502}
7503
7504PyObject *
7505PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7506 Py_ssize_t size,
7507 const char *errors)
7508{
Victor Stinner7581cef2011-11-03 22:32:33 +01007509 PyObject *unicode, *res;
7510 unicode = PyUnicode_FromUnicode(p, size);
7511 if (unicode == NULL)
7512 return NULL;
7513 res = encode_code_page(CP_ACP, unicode, errors);
7514 Py_DECREF(unicode);
7515 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007516}
7517
7518PyObject *
7519PyUnicode_EncodeCodePage(int code_page,
7520 PyObject *unicode,
7521 const char *errors)
7522{
Victor Stinner7581cef2011-11-03 22:32:33 +01007523 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007524}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007525
Alexander Belopolsky40018472011-02-26 01:02:56 +00007526PyObject *
7527PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007528{
7529 if (!PyUnicode_Check(unicode)) {
7530 PyErr_BadArgument();
7531 return NULL;
7532 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007533 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007534}
7535
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007536#undef NEED_RETRY
7537
Victor Stinner99b95382011-07-04 14:23:54 +02007538#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007539
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540/* --- Character Mapping Codec -------------------------------------------- */
7541
Victor Stinnerfb161b12013-04-18 01:44:27 +02007542static int
7543charmap_decode_string(const char *s,
7544 Py_ssize_t size,
7545 PyObject *mapping,
7546 const char *errors,
7547 _PyUnicodeWriter *writer)
7548{
7549 const char *starts = s;
7550 const char *e;
7551 Py_ssize_t startinpos, endinpos;
7552 PyObject *errorHandler = NULL, *exc = NULL;
7553 Py_ssize_t maplen;
7554 enum PyUnicode_Kind mapkind;
7555 void *mapdata;
7556 Py_UCS4 x;
7557 unsigned char ch;
7558
7559 if (PyUnicode_READY(mapping) == -1)
7560 return -1;
7561
7562 maplen = PyUnicode_GET_LENGTH(mapping);
7563 mapdata = PyUnicode_DATA(mapping);
7564 mapkind = PyUnicode_KIND(mapping);
7565
7566 e = s + size;
7567
7568 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7569 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7570 * is disabled in encoding aliases, latin1 is preferred because
7571 * its implementation is faster. */
7572 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7573 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7574 Py_UCS4 maxchar = writer->maxchar;
7575
7576 assert (writer->kind == PyUnicode_1BYTE_KIND);
7577 while (s < e) {
7578 ch = *s;
7579 x = mapdata_ucs1[ch];
7580 if (x > maxchar) {
7581 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7582 goto onError;
7583 maxchar = writer->maxchar;
7584 outdata = (Py_UCS1 *)writer->data;
7585 }
7586 outdata[writer->pos] = x;
7587 writer->pos++;
7588 ++s;
7589 }
7590 return 0;
7591 }
7592
7593 while (s < e) {
7594 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7595 enum PyUnicode_Kind outkind = writer->kind;
7596 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7597 if (outkind == PyUnicode_1BYTE_KIND) {
7598 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7599 Py_UCS4 maxchar = writer->maxchar;
7600 while (s < e) {
7601 ch = *s;
7602 x = mapdata_ucs2[ch];
7603 if (x > maxchar)
7604 goto Error;
7605 outdata[writer->pos] = x;
7606 writer->pos++;
7607 ++s;
7608 }
7609 break;
7610 }
7611 else if (outkind == PyUnicode_2BYTE_KIND) {
7612 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7613 while (s < e) {
7614 ch = *s;
7615 x = mapdata_ucs2[ch];
7616 if (x == 0xFFFE)
7617 goto Error;
7618 outdata[writer->pos] = x;
7619 writer->pos++;
7620 ++s;
7621 }
7622 break;
7623 }
7624 }
7625 ch = *s;
7626
7627 if (ch < maplen)
7628 x = PyUnicode_READ(mapkind, mapdata, ch);
7629 else
7630 x = 0xfffe; /* invalid value */
7631Error:
7632 if (x == 0xfffe)
7633 {
7634 /* undefined mapping */
7635 startinpos = s-starts;
7636 endinpos = startinpos+1;
7637 if (unicode_decode_call_errorhandler_writer(
7638 errors, &errorHandler,
7639 "charmap", "character maps to <undefined>",
7640 &starts, &e, &startinpos, &endinpos, &exc, &s,
7641 writer)) {
7642 goto onError;
7643 }
7644 continue;
7645 }
7646
7647 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7648 goto onError;
7649 ++s;
7650 }
7651 Py_XDECREF(errorHandler);
7652 Py_XDECREF(exc);
7653 return 0;
7654
7655onError:
7656 Py_XDECREF(errorHandler);
7657 Py_XDECREF(exc);
7658 return -1;
7659}
7660
7661static int
7662charmap_decode_mapping(const char *s,
7663 Py_ssize_t size,
7664 PyObject *mapping,
7665 const char *errors,
7666 _PyUnicodeWriter *writer)
7667{
7668 const char *starts = s;
7669 const char *e;
7670 Py_ssize_t startinpos, endinpos;
7671 PyObject *errorHandler = NULL, *exc = NULL;
7672 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007673 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007674
7675 e = s + size;
7676
7677 while (s < e) {
7678 ch = *s;
7679
7680 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7681 key = PyLong_FromLong((long)ch);
7682 if (key == NULL)
7683 goto onError;
7684
7685 item = PyObject_GetItem(mapping, key);
7686 Py_DECREF(key);
7687 if (item == NULL) {
7688 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7689 /* No mapping found means: mapping is undefined. */
7690 PyErr_Clear();
7691 goto Undefined;
7692 } else
7693 goto onError;
7694 }
7695
7696 /* Apply mapping */
7697 if (item == Py_None)
7698 goto Undefined;
7699 if (PyLong_Check(item)) {
7700 long value = PyLong_AS_LONG(item);
7701 if (value == 0xFFFE)
7702 goto Undefined;
7703 if (value < 0 || value > MAX_UNICODE) {
7704 PyErr_Format(PyExc_TypeError,
7705 "character mapping must be in range(0x%lx)",
7706 (unsigned long)MAX_UNICODE + 1);
7707 goto onError;
7708 }
7709
7710 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7711 goto onError;
7712 }
7713 else if (PyUnicode_Check(item)) {
7714 if (PyUnicode_READY(item) == -1)
7715 goto onError;
7716 if (PyUnicode_GET_LENGTH(item) == 1) {
7717 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7718 if (value == 0xFFFE)
7719 goto Undefined;
7720 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7721 goto onError;
7722 }
7723 else {
7724 writer->overallocate = 1;
7725 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7726 goto onError;
7727 }
7728 }
7729 else {
7730 /* wrong return value */
7731 PyErr_SetString(PyExc_TypeError,
7732 "character mapping must return integer, None or str");
7733 goto onError;
7734 }
7735 Py_CLEAR(item);
7736 ++s;
7737 continue;
7738
7739Undefined:
7740 /* undefined mapping */
7741 Py_CLEAR(item);
7742 startinpos = s-starts;
7743 endinpos = startinpos+1;
7744 if (unicode_decode_call_errorhandler_writer(
7745 errors, &errorHandler,
7746 "charmap", "character maps to <undefined>",
7747 &starts, &e, &startinpos, &endinpos, &exc, &s,
7748 writer)) {
7749 goto onError;
7750 }
7751 }
7752 Py_XDECREF(errorHandler);
7753 Py_XDECREF(exc);
7754 return 0;
7755
7756onError:
7757 Py_XDECREF(item);
7758 Py_XDECREF(errorHandler);
7759 Py_XDECREF(exc);
7760 return -1;
7761}
7762
Alexander Belopolsky40018472011-02-26 01:02:56 +00007763PyObject *
7764PyUnicode_DecodeCharmap(const char *s,
7765 Py_ssize_t size,
7766 PyObject *mapping,
7767 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007769 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007770
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771 /* Default to Latin-1 */
7772 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007776 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007777 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007778 writer.min_length = size;
7779 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007781
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007782 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007783 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7784 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007785 }
7786 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007787 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7788 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007790 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007791
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007793 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794 return NULL;
7795}
7796
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007797/* Charmap encoding: the lookup table */
7798
Alexander Belopolsky40018472011-02-26 01:02:56 +00007799struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 PyObject_HEAD
7801 unsigned char level1[32];
7802 int count2, count3;
7803 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007804};
7805
7806static PyObject*
7807encoding_map_size(PyObject *obj, PyObject* args)
7808{
7809 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007810 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007811 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007812}
7813
7814static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007815 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007816 PyDoc_STR("Return the size (in bytes) of this object") },
7817 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007818};
7819
7820static void
7821encoding_map_dealloc(PyObject* o)
7822{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007823 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007824}
7825
7826static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007827 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007828 "EncodingMap", /*tp_name*/
7829 sizeof(struct encoding_map), /*tp_basicsize*/
7830 0, /*tp_itemsize*/
7831 /* methods */
7832 encoding_map_dealloc, /*tp_dealloc*/
7833 0, /*tp_print*/
7834 0, /*tp_getattr*/
7835 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007836 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007837 0, /*tp_repr*/
7838 0, /*tp_as_number*/
7839 0, /*tp_as_sequence*/
7840 0, /*tp_as_mapping*/
7841 0, /*tp_hash*/
7842 0, /*tp_call*/
7843 0, /*tp_str*/
7844 0, /*tp_getattro*/
7845 0, /*tp_setattro*/
7846 0, /*tp_as_buffer*/
7847 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7848 0, /*tp_doc*/
7849 0, /*tp_traverse*/
7850 0, /*tp_clear*/
7851 0, /*tp_richcompare*/
7852 0, /*tp_weaklistoffset*/
7853 0, /*tp_iter*/
7854 0, /*tp_iternext*/
7855 encoding_map_methods, /*tp_methods*/
7856 0, /*tp_members*/
7857 0, /*tp_getset*/
7858 0, /*tp_base*/
7859 0, /*tp_dict*/
7860 0, /*tp_descr_get*/
7861 0, /*tp_descr_set*/
7862 0, /*tp_dictoffset*/
7863 0, /*tp_init*/
7864 0, /*tp_alloc*/
7865 0, /*tp_new*/
7866 0, /*tp_free*/
7867 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007868};
7869
7870PyObject*
7871PyUnicode_BuildEncodingMap(PyObject* string)
7872{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007873 PyObject *result;
7874 struct encoding_map *mresult;
7875 int i;
7876 int need_dict = 0;
7877 unsigned char level1[32];
7878 unsigned char level2[512];
7879 unsigned char *mlevel1, *mlevel2, *mlevel3;
7880 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007881 int kind;
7882 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007883 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007884 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007885
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007886 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007887 PyErr_BadArgument();
7888 return NULL;
7889 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007890 kind = PyUnicode_KIND(string);
7891 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007892 length = PyUnicode_GET_LENGTH(string);
7893 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007894 memset(level1, 0xFF, sizeof level1);
7895 memset(level2, 0xFF, sizeof level2);
7896
7897 /* If there isn't a one-to-one mapping of NULL to \0,
7898 or if there are non-BMP characters, we need to use
7899 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007900 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007901 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007902 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007903 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007904 ch = PyUnicode_READ(kind, data, i);
7905 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007906 need_dict = 1;
7907 break;
7908 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007909 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007910 /* unmapped character */
7911 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007912 l1 = ch >> 11;
7913 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007914 if (level1[l1] == 0xFF)
7915 level1[l1] = count2++;
7916 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007917 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007918 }
7919
7920 if (count2 >= 0xFF || count3 >= 0xFF)
7921 need_dict = 1;
7922
7923 if (need_dict) {
7924 PyObject *result = PyDict_New();
7925 PyObject *key, *value;
7926 if (!result)
7927 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007928 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007929 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007930 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007931 if (!key || !value)
7932 goto failed1;
7933 if (PyDict_SetItem(result, key, value) == -1)
7934 goto failed1;
7935 Py_DECREF(key);
7936 Py_DECREF(value);
7937 }
7938 return result;
7939 failed1:
7940 Py_XDECREF(key);
7941 Py_XDECREF(value);
7942 Py_DECREF(result);
7943 return NULL;
7944 }
7945
7946 /* Create a three-level trie */
7947 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7948 16*count2 + 128*count3 - 1);
7949 if (!result)
7950 return PyErr_NoMemory();
7951 PyObject_Init(result, &EncodingMapType);
7952 mresult = (struct encoding_map*)result;
7953 mresult->count2 = count2;
7954 mresult->count3 = count3;
7955 mlevel1 = mresult->level1;
7956 mlevel2 = mresult->level23;
7957 mlevel3 = mresult->level23 + 16*count2;
7958 memcpy(mlevel1, level1, 32);
7959 memset(mlevel2, 0xFF, 16*count2);
7960 memset(mlevel3, 0, 128*count3);
7961 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007962 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007963 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007964 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7965 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007966 /* unmapped character */
7967 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007968 o1 = ch>>11;
7969 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007970 i2 = 16*mlevel1[o1] + o2;
7971 if (mlevel2[i2] == 0xFF)
7972 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007973 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007974 i3 = 128*mlevel2[i2] + o3;
7975 mlevel3[i3] = i;
7976 }
7977 return result;
7978}
7979
7980static int
Victor Stinner22168992011-11-20 17:09:18 +01007981encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007982{
7983 struct encoding_map *map = (struct encoding_map*)mapping;
7984 int l1 = c>>11;
7985 int l2 = (c>>7) & 0xF;
7986 int l3 = c & 0x7F;
7987 int i;
7988
Victor Stinner22168992011-11-20 17:09:18 +01007989 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007990 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007991 if (c == 0)
7992 return 0;
7993 /* level 1*/
7994 i = map->level1[l1];
7995 if (i == 0xFF) {
7996 return -1;
7997 }
7998 /* level 2*/
7999 i = map->level23[16*i+l2];
8000 if (i == 0xFF) {
8001 return -1;
8002 }
8003 /* level 3 */
8004 i = map->level23[16*map->count2 + 128*i + l3];
8005 if (i == 0) {
8006 return -1;
8007 }
8008 return i;
8009}
8010
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008011/* Lookup the character ch in the mapping. If the character
8012 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008013 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008014static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008015charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016{
Christian Heimes217cfd12007-12-02 14:31:20 +00008017 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008018 PyObject *x;
8019
8020 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008021 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008022 x = PyObject_GetItem(mapping, w);
8023 Py_DECREF(w);
8024 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8026 /* No mapping found means: mapping is undefined. */
8027 PyErr_Clear();
8028 x = Py_None;
8029 Py_INCREF(x);
8030 return x;
8031 } else
8032 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008034 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008035 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008036 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 long value = PyLong_AS_LONG(x);
8038 if (value < 0 || value > 255) {
8039 PyErr_SetString(PyExc_TypeError,
8040 "character mapping must be in range(256)");
8041 Py_DECREF(x);
8042 return NULL;
8043 }
8044 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008046 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008049 /* wrong return value */
8050 PyErr_Format(PyExc_TypeError,
8051 "character mapping must return integer, bytes or None, not %.400s",
8052 x->ob_type->tp_name);
8053 Py_DECREF(x);
8054 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055 }
8056}
8057
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008058static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008059charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008060{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008061 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8062 /* exponentially overallocate to minimize reallocations */
8063 if (requiredsize < 2*outsize)
8064 requiredsize = 2*outsize;
8065 if (_PyBytes_Resize(outobj, requiredsize))
8066 return -1;
8067 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008068}
8069
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded and written */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* an error occurred (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008073/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008074 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008075 space is available. Return a new reference to the object that
8076 was put in the output buffer, or Py_None, if the mapping was undefined
8077 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008078 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008079static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008080charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008081 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008082{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008083 PyObject *rep;
8084 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008085 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008086
Christian Heimes90aa7642007-12-19 02:45:37 +00008087 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008088 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008089 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008090 if (res == -1)
8091 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008092 if (outsize<requiredsize)
8093 if (charmapencode_resize(outobj, outpos, requiredsize))
8094 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008095 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 outstart[(*outpos)++] = (char)res;
8097 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008098 }
8099
8100 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008101 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008103 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 Py_DECREF(rep);
8105 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008106 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008107 if (PyLong_Check(rep)) {
8108 Py_ssize_t requiredsize = *outpos+1;
8109 if (outsize<requiredsize)
8110 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8111 Py_DECREF(rep);
8112 return enc_EXCEPTION;
8113 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008114 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008115 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008116 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008117 else {
8118 const char *repchars = PyBytes_AS_STRING(rep);
8119 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8120 Py_ssize_t requiredsize = *outpos+repsize;
8121 if (outsize<requiredsize)
8122 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8123 Py_DECREF(rep);
8124 return enc_EXCEPTION;
8125 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008126 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 memcpy(outstart + *outpos, repchars, repsize);
8128 *outpos += repsize;
8129 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008130 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008131 Py_DECREF(rep);
8132 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008133}
8134
8135/* handle an error in PyUnicode_EncodeCharmap
8136 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008137static int
8138charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008139 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008141 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008142 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008143{
8144 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008145 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008146 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008147 enum PyUnicode_Kind kind;
8148 void *data;
8149 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008150 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008151 Py_ssize_t collstartpos = *inpos;
8152 Py_ssize_t collendpos = *inpos+1;
8153 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008154 char *encoding = "charmap";
8155 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008156 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008157 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008158 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008159
Benjamin Petersonbac79492012-01-14 13:34:47 -05008160 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008161 return -1;
8162 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008163 /* find all unencodable characters */
8164 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008165 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008166 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008167 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008168 val = encoding_map_lookup(ch, mapping);
8169 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008170 break;
8171 ++collendpos;
8172 continue;
8173 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008174
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008175 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8176 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 if (rep==NULL)
8178 return -1;
8179 else if (rep!=Py_None) {
8180 Py_DECREF(rep);
8181 break;
8182 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008183 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008185 }
8186 /* cache callback name lookup
8187 * (if not done yet, i.e. it's the first error) */
8188 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008189 if ((errors==NULL) || (!strcmp(errors, "strict")))
8190 *known_errorHandler = 1;
8191 else if (!strcmp(errors, "replace"))
8192 *known_errorHandler = 2;
8193 else if (!strcmp(errors, "ignore"))
8194 *known_errorHandler = 3;
8195 else if (!strcmp(errors, "xmlcharrefreplace"))
8196 *known_errorHandler = 4;
8197 else
8198 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008199 }
8200 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008201 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008202 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008203 return -1;
8204 case 2: /* replace */
8205 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008206 x = charmapencode_output('?', mapping, res, respos);
8207 if (x==enc_EXCEPTION) {
8208 return -1;
8209 }
8210 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008211 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008212 return -1;
8213 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008214 }
8215 /* fall through */
8216 case 3: /* ignore */
8217 *inpos = collendpos;
8218 break;
8219 case 4: /* xmlcharrefreplace */
8220 /* generate replacement (temporarily (mis)uses p) */
8221 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008222 char buffer[2+29+1+1];
8223 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008224 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008225 for (cp = buffer; *cp; ++cp) {
8226 x = charmapencode_output(*cp, mapping, res, respos);
8227 if (x==enc_EXCEPTION)
8228 return -1;
8229 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008230 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008231 return -1;
8232 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008233 }
8234 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008235 *inpos = collendpos;
8236 break;
8237 default:
8238 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008239 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008241 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008243 if (PyBytes_Check(repunicode)) {
8244 /* Directly copy bytes result to output. */
8245 Py_ssize_t outsize = PyBytes_Size(*res);
8246 Py_ssize_t requiredsize;
8247 repsize = PyBytes_Size(repunicode);
8248 requiredsize = *respos + repsize;
8249 if (requiredsize > outsize)
8250 /* Make room for all additional bytes. */
8251 if (charmapencode_resize(res, respos, requiredsize)) {
8252 Py_DECREF(repunicode);
8253 return -1;
8254 }
8255 memcpy(PyBytes_AsString(*res) + *respos,
8256 PyBytes_AsString(repunicode), repsize);
8257 *respos += repsize;
8258 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008259 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008260 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008261 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008262 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008263 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008264 Py_DECREF(repunicode);
8265 return -1;
8266 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008267 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008268 data = PyUnicode_DATA(repunicode);
8269 kind = PyUnicode_KIND(repunicode);
8270 for (index = 0; index < repsize; index++) {
8271 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8272 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008274 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 return -1;
8276 }
8277 else if (x==enc_FAILED) {
8278 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008279 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 return -1;
8281 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008282 }
8283 *inpos = newpos;
8284 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008285 }
8286 return 0;
8287}
8288
Alexander Belopolsky40018472011-02-26 01:02:56 +00008289PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008290_PyUnicode_EncodeCharmap(PyObject *unicode,
8291 PyObject *mapping,
8292 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008294 /* output object */
8295 PyObject *res = NULL;
8296 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008297 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008298 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008299 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008300 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008301 PyObject *errorHandler = NULL;
8302 PyObject *exc = NULL;
8303 /* the following variable is used for caching string comparisons
8304 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8305 * 3=ignore, 4=xmlcharrefreplace */
8306 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008307 void *data;
8308 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309
Benjamin Petersonbac79492012-01-14 13:34:47 -05008310 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008311 return NULL;
8312 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008313 data = PyUnicode_DATA(unicode);
8314 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008315
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316 /* Default to Latin-1 */
8317 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008318 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008320 /* allocate enough for a simple encoding without
8321 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008322 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008323 if (res == NULL)
8324 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008325 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008327
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008328 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008329 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008330 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008331 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008332 if (x==enc_EXCEPTION) /* error */
8333 goto onError;
8334 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008335 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 &exc,
8337 &known_errorHandler, &errorHandler, errors,
8338 &res, &respos)) {
8339 goto onError;
8340 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008341 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008342 else
8343 /* done with this character => adjust input position */
8344 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008345 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008347 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008348 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008349 if (_PyBytes_Resize(&res, respos) < 0)
8350 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008351
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008352 Py_XDECREF(exc);
8353 Py_XDECREF(errorHandler);
8354 return res;
8355
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008357 Py_XDECREF(res);
8358 Py_XDECREF(exc);
8359 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008360 return NULL;
8361}
8362
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008363/* Deprecated */
8364PyObject *
8365PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8366 Py_ssize_t size,
8367 PyObject *mapping,
8368 const char *errors)
8369{
8370 PyObject *result;
8371 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8372 if (unicode == NULL)
8373 return NULL;
8374 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8375 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008376 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008377}
8378
Alexander Belopolsky40018472011-02-26 01:02:56 +00008379PyObject *
8380PyUnicode_AsCharmapString(PyObject *unicode,
8381 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382{
8383 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 PyErr_BadArgument();
8385 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008386 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008387 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008388}
8389
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008391static void
8392make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008393 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008394 Py_ssize_t startpos, Py_ssize_t endpos,
8395 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008396{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008397 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008398 *exceptionObject = _PyUnicodeTranslateError_Create(
8399 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400 }
8401 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8403 goto onError;
8404 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8405 goto onError;
8406 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8407 goto onError;
8408 return;
8409 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008410 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008411 }
8412}
8413
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008414/* error handling callback helper:
8415 build arguments, call the callback and check the arguments,
8416 put the result into newpos and return the replacement string, which
8417 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008418static PyObject *
8419unicode_translate_call_errorhandler(const char *errors,
8420 PyObject **errorHandler,
8421 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008422 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008423 Py_ssize_t startpos, Py_ssize_t endpos,
8424 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008425{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008426 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008427
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008428 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008429 PyObject *restuple;
8430 PyObject *resunicode;
8431
8432 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008433 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008435 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008436 }
8437
8438 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008439 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008440 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008442
8443 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008445 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008447 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008448 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 Py_DECREF(restuple);
8450 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008451 }
8452 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 &resunicode, &i_newpos)) {
8454 Py_DECREF(restuple);
8455 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008456 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008457 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008458 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008459 else
8460 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008461 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8463 Py_DECREF(restuple);
8464 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008465 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008466 Py_INCREF(resunicode);
8467 Py_DECREF(restuple);
8468 return resunicode;
8469}
8470
8471/* Lookup the character ch in the mapping and put the result in result,
8472 which must be decrefed by the caller.
8473 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008474static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008475charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008476{
Christian Heimes217cfd12007-12-02 14:31:20 +00008477 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008478 PyObject *x;
8479
8480 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008481 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008482 x = PyObject_GetItem(mapping, w);
8483 Py_DECREF(w);
8484 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8486 /* No mapping found means: use 1:1 mapping. */
8487 PyErr_Clear();
8488 *result = NULL;
8489 return 0;
8490 } else
8491 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008492 }
8493 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 *result = x;
8495 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008496 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008497 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 long value = PyLong_AS_LONG(x);
8499 long max = PyUnicode_GetMax();
8500 if (value < 0 || value > max) {
8501 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008502 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 Py_DECREF(x);
8504 return -1;
8505 }
8506 *result = x;
8507 return 0;
8508 }
8509 else if (PyUnicode_Check(x)) {
8510 *result = x;
8511 return 0;
8512 }
8513 else {
8514 /* wrong return value */
8515 PyErr_SetString(PyExc_TypeError,
8516 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008517 Py_DECREF(x);
8518 return -1;
8519 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520}
8521/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 if not reallocate and adjust various state variables.
8523 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008524static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008525charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008527{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008528 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008529 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008530 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008531 /* exponentially overallocate to minimize reallocations */
8532 if (requiredsize < 2 * oldsize)
8533 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008534 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8535 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008537 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008539 }
8540 return 0;
8541}
8542/* lookup the character, put the result in the output string and adjust
8543 various state variables. Return a new reference to the object that
8544 was put in the output buffer in *result, or Py_None, if the mapping was
8545 undefined (in which case no character was written).
8546 The called must decref result.
8547 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008548static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008549charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8550 PyObject *mapping, Py_UCS4 **output,
8551 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008552 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008553{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008554 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8555 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008556 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008557 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008558 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008559 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008560 }
8561 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008563 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008565 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008566 }
8567 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008568 Py_ssize_t repsize;
8569 if (PyUnicode_READY(*res) == -1)
8570 return -1;
8571 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 if (repsize==1) {
8573 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008574 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 }
8576 else if (repsize!=0) {
8577 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008578 Py_ssize_t requiredsize = *opos +
8579 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008581 Py_ssize_t i;
8582 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008584 for(i = 0; i < repsize; i++)
8585 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008587 }
8588 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008589 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008590 return 0;
8591}
8592
Alexander Belopolsky40018472011-02-26 01:02:56 +00008593PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008594_PyUnicode_TranslateCharmap(PyObject *input,
8595 PyObject *mapping,
8596 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008598 /* input object */
8599 char *idata;
8600 Py_ssize_t size, i;
8601 int kind;
8602 /* output buffer */
8603 Py_UCS4 *output = NULL;
8604 Py_ssize_t osize;
8605 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008606 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008607 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008608 char *reason = "character maps to <undefined>";
8609 PyObject *errorHandler = NULL;
8610 PyObject *exc = NULL;
8611 /* the following variable is used for caching string comparisons
8612 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8613 * 3=ignore, 4=xmlcharrefreplace */
8614 int known_errorHandler = -1;
8615
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008617 PyErr_BadArgument();
8618 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008621 if (PyUnicode_READY(input) == -1)
8622 return NULL;
8623 idata = (char*)PyUnicode_DATA(input);
8624 kind = PyUnicode_KIND(input);
8625 size = PyUnicode_GET_LENGTH(input);
8626 i = 0;
8627
8628 if (size == 0) {
8629 Py_INCREF(input);
8630 return input;
8631 }
8632
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008633 /* allocate enough for a simple 1:1 translation without
8634 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635 osize = size;
8636 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8637 opos = 0;
8638 if (output == NULL) {
8639 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008640 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008644 /* try to encode it */
8645 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008646 if (charmaptranslate_output(input, i, mapping,
8647 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 Py_XDECREF(x);
8649 goto onError;
8650 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008651 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008652 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008653 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 else { /* untranslatable character */
8655 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8656 Py_ssize_t repsize;
8657 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008658 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660 Py_ssize_t collstart = i;
8661 Py_ssize_t collend = i+1;
8662 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008663
Benjamin Peterson29060642009-01-31 22:14:21 +00008664 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008665 while (collend < size) {
8666 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 goto onError;
8668 Py_XDECREF(x);
8669 if (x!=Py_None)
8670 break;
8671 ++collend;
8672 }
8673 /* cache callback name lookup
8674 * (if not done yet, i.e. it's the first error) */
8675 if (known_errorHandler==-1) {
8676 if ((errors==NULL) || (!strcmp(errors, "strict")))
8677 known_errorHandler = 1;
8678 else if (!strcmp(errors, "replace"))
8679 known_errorHandler = 2;
8680 else if (!strcmp(errors, "ignore"))
8681 known_errorHandler = 3;
8682 else if (!strcmp(errors, "xmlcharrefreplace"))
8683 known_errorHandler = 4;
8684 else
8685 known_errorHandler = 0;
8686 }
8687 switch (known_errorHandler) {
8688 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008689 make_translate_exception(&exc,
8690 input, collstart, collend, reason);
8691 if (exc != NULL)
8692 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008693 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 case 2: /* replace */
8695 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008696 for (coll = collstart; coll<collend; coll++)
8697 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 /* fall through */
8699 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008700 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008701 break;
8702 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008703 /* generate replacement (temporarily (mis)uses i) */
8704 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 char buffer[2+29+1+1];
8706 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8708 if (charmaptranslate_makespace(&output, &osize,
8709 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008710 goto onError;
8711 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 break;
8716 default:
8717 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008718 reason, input, &exc,
8719 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008720 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008722 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008723 Py_DECREF(repunicode);
8724 goto onError;
8725 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008727 repsize = PyUnicode_GET_LENGTH(repunicode);
8728 if (charmaptranslate_makespace(&output, &osize,
8729 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 Py_DECREF(repunicode);
8731 goto onError;
8732 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008733 for (uni2 = 0; repsize-->0; ++uni2)
8734 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8735 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008737 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008738 }
8739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008740 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8741 if (!res)
8742 goto onError;
8743 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008744 Py_XDECREF(exc);
8745 Py_XDECREF(errorHandler);
8746 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747
Benjamin Peterson29060642009-01-31 22:14:21 +00008748 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008749 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008750 Py_XDECREF(exc);
8751 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752 return NULL;
8753}
8754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755/* Deprecated. Use PyUnicode_Translate instead. */
8756PyObject *
8757PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8758 Py_ssize_t size,
8759 PyObject *mapping,
8760 const char *errors)
8761{
Christian Heimes5f520f42012-09-11 14:03:25 +02008762 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008763 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8764 if (!unicode)
8765 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008766 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8767 Py_DECREF(unicode);
8768 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008769}
8770
Alexander Belopolsky40018472011-02-26 01:02:56 +00008771PyObject *
8772PyUnicode_Translate(PyObject *str,
8773 PyObject *mapping,
8774 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008775{
8776 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008777
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778 str = PyUnicode_FromObject(str);
8779 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008780 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008781 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782 Py_DECREF(str);
8783 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008784}
Tim Petersced69f82003-09-16 20:30:58 +00008785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008786static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008787fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008788{
8789 /* No need to call PyUnicode_READY(self) because this function is only
8790 called as a callback from fixup() which does it already. */
8791 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8792 const int kind = PyUnicode_KIND(self);
8793 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008794 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008795 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008796 Py_ssize_t i;
8797
8798 for (i = 0; i < len; ++i) {
8799 ch = PyUnicode_READ(kind, data, i);
8800 fixed = 0;
8801 if (ch > 127) {
8802 if (Py_UNICODE_ISSPACE(ch))
8803 fixed = ' ';
8804 else {
8805 const int decimal = Py_UNICODE_TODECIMAL(ch);
8806 if (decimal >= 0)
8807 fixed = '0' + decimal;
8808 }
8809 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008810 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008811 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008812 PyUnicode_WRITE(kind, data, i, fixed);
8813 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008814 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008815 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008816 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008817 }
8818
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008819 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008820}
8821
8822PyObject *
8823_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8824{
8825 if (!PyUnicode_Check(unicode)) {
8826 PyErr_BadInternalCall();
8827 return NULL;
8828 }
8829 if (PyUnicode_READY(unicode) == -1)
8830 return NULL;
8831 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8832 /* If the string is already ASCII, just return the same string */
8833 Py_INCREF(unicode);
8834 return unicode;
8835 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008836 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008837}
8838
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008839PyObject *
8840PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8841 Py_ssize_t length)
8842{
Victor Stinnerf0124502011-11-21 23:12:56 +01008843 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008844 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008845 Py_UCS4 maxchar;
8846 enum PyUnicode_Kind kind;
8847 void *data;
8848
Victor Stinner99d7ad02012-02-22 13:37:39 +01008849 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008850 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008851 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008852 if (ch > 127) {
8853 int decimal = Py_UNICODE_TODECIMAL(ch);
8854 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008855 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008856 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008857 }
8858 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008859
8860 /* Copy to a new string */
8861 decimal = PyUnicode_New(length, maxchar);
8862 if (decimal == NULL)
8863 return decimal;
8864 kind = PyUnicode_KIND(decimal);
8865 data = PyUnicode_DATA(decimal);
8866 /* Iterate over code points */
8867 for (i = 0; i < length; i++) {
8868 Py_UNICODE ch = s[i];
8869 if (ch > 127) {
8870 int decimal = Py_UNICODE_TODECIMAL(ch);
8871 if (decimal >= 0)
8872 ch = '0' + decimal;
8873 }
8874 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008876 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008877}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008878/* --- Decimal Encoder ---------------------------------------------------- */
8879
Alexander Belopolsky40018472011-02-26 01:02:56 +00008880int
8881PyUnicode_EncodeDecimal(Py_UNICODE *s,
8882 Py_ssize_t length,
8883 char *output,
8884 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008885{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008886 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008887 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008888 enum PyUnicode_Kind kind;
8889 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008890
8891 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008892 PyErr_BadArgument();
8893 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008894 }
8895
Victor Stinner42bf7752011-11-21 22:52:58 +01008896 unicode = PyUnicode_FromUnicode(s, length);
8897 if (unicode == NULL)
8898 return -1;
8899
Benjamin Petersonbac79492012-01-14 13:34:47 -05008900 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008901 Py_DECREF(unicode);
8902 return -1;
8903 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008904 kind = PyUnicode_KIND(unicode);
8905 data = PyUnicode_DATA(unicode);
8906
Victor Stinnerb84d7232011-11-22 01:50:07 +01008907 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008908 PyObject *exc;
8909 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008910 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008911 Py_ssize_t startpos;
8912
8913 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008914
Benjamin Peterson29060642009-01-31 22:14:21 +00008915 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008916 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008917 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008918 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008919 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008920 decimal = Py_UNICODE_TODECIMAL(ch);
8921 if (decimal >= 0) {
8922 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008923 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008924 continue;
8925 }
8926 if (0 < ch && ch < 256) {
8927 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008928 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008929 continue;
8930 }
Victor Stinner6345be92011-11-25 20:09:01 +01008931
Victor Stinner42bf7752011-11-21 22:52:58 +01008932 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008933 exc = NULL;
8934 raise_encode_exception(&exc, "decimal", unicode,
8935 startpos, startpos+1,
8936 "invalid decimal Unicode string");
8937 Py_XDECREF(exc);
8938 Py_DECREF(unicode);
8939 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008940 }
8941 /* 0-terminate the output string */
8942 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008943 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008944 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008945}
8946
Guido van Rossumd57fd912000-03-10 22:53:23 +00008947/* --- Helpers ------------------------------------------------------------ */
8948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008950any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951 Py_ssize_t start,
8952 Py_ssize_t end)
8953{
8954 int kind1, kind2, kind;
8955 void *buf1, *buf2;
8956 Py_ssize_t len1, len2, result;
8957
8958 kind1 = PyUnicode_KIND(s1);
8959 kind2 = PyUnicode_KIND(s2);
8960 kind = kind1 > kind2 ? kind1 : kind2;
8961 buf1 = PyUnicode_DATA(s1);
8962 buf2 = PyUnicode_DATA(s2);
8963 if (kind1 != kind)
8964 buf1 = _PyUnicode_AsKind(s1, kind);
8965 if (!buf1)
8966 return -2;
8967 if (kind2 != kind)
8968 buf2 = _PyUnicode_AsKind(s2, kind);
8969 if (!buf2) {
8970 if (kind1 != kind) PyMem_Free(buf1);
8971 return -2;
8972 }
8973 len1 = PyUnicode_GET_LENGTH(s1);
8974 len2 = PyUnicode_GET_LENGTH(s2);
8975
Victor Stinner794d5672011-10-10 03:21:36 +02008976 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008977 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008978 case PyUnicode_1BYTE_KIND:
8979 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8980 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8981 else
8982 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8983 break;
8984 case PyUnicode_2BYTE_KIND:
8985 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8986 break;
8987 case PyUnicode_4BYTE_KIND:
8988 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8989 break;
8990 default:
8991 assert(0); result = -2;
8992 }
8993 }
8994 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008995 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008996 case PyUnicode_1BYTE_KIND:
8997 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8998 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8999 else
9000 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9001 break;
9002 case PyUnicode_2BYTE_KIND:
9003 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9004 break;
9005 case PyUnicode_4BYTE_KIND:
9006 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9007 break;
9008 default:
9009 assert(0); result = -2;
9010 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 }
9012
9013 if (kind1 != kind)
9014 PyMem_Free(buf1);
9015 if (kind2 != kind)
9016 PyMem_Free(buf2);
9017
9018 return result;
9019}
9020
9021Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009022_PyUnicode_InsertThousandsGrouping(
9023 PyObject *unicode, Py_ssize_t index,
9024 Py_ssize_t n_buffer,
9025 void *digits, Py_ssize_t n_digits,
9026 Py_ssize_t min_width,
9027 const char *grouping, PyObject *thousands_sep,
9028 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009029{
Victor Stinner41a863c2012-02-24 00:37:51 +01009030 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009031 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009032 Py_ssize_t thousands_sep_len;
9033 Py_ssize_t len;
9034
9035 if (unicode != NULL) {
9036 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009037 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009038 }
9039 else {
9040 kind = PyUnicode_1BYTE_KIND;
9041 data = NULL;
9042 }
9043 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9044 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9045 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9046 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009047 if (thousands_sep_kind < kind) {
9048 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9049 if (!thousands_sep_data)
9050 return -1;
9051 }
9052 else {
9053 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9054 if (!data)
9055 return -1;
9056 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009057 }
9058
Benjamin Petersonead6b532011-12-20 17:23:42 -06009059 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009061 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009062 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009063 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009064 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009065 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009066 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009067 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009068 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009069 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009070 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009071 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009072 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009073 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009074 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009075 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009076 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009077 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009078 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009079 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009080 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009081 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009082 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009083 break;
9084 default:
9085 assert(0);
9086 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009087 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009088 if (unicode != NULL && thousands_sep_kind != kind) {
9089 if (thousands_sep_kind < kind)
9090 PyMem_Free(thousands_sep_data);
9091 else
9092 PyMem_Free(data);
9093 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009094 if (unicode == NULL) {
9095 *maxchar = 127;
9096 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009097 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009098 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009099 }
9100 }
9101 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009102}
9103
9104
/* Helper macro to fix up start/end slice values against a length:
   - end greater than len is clamped to len;
   - a negative end or start has len added and is then clamped to 0.
   start is not clamped against len.

   start and end must be modifiable lvalues; every argument may be
   evaluated more than once.  The expansion is a single statement, so the
   macro is used like a function call followed by a semicolon and is safe
   inside unbraced if/else bodies. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009119
Alexander Belopolsky40018472011-02-26 01:02:56 +00009120Py_ssize_t
9121PyUnicode_Count(PyObject *str,
9122 PyObject *substr,
9123 Py_ssize_t start,
9124 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009125{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009126 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009127 PyObject* str_obj;
9128 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009129 int kind1, kind2, kind;
9130 void *buf1 = NULL, *buf2 = NULL;
9131 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009132
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009133 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009134 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009135 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009136 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009137 if (!sub_obj) {
9138 Py_DECREF(str_obj);
9139 return -1;
9140 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009141 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009142 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009143 Py_DECREF(str_obj);
9144 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145 }
Tim Petersced69f82003-09-16 20:30:58 +00009146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009147 kind1 = PyUnicode_KIND(str_obj);
9148 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009149 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009150 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009151 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009152 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02009153 if (kind2 > kind) {
9154 Py_DECREF(sub_obj);
9155 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009156 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02009157 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01009158 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009159 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009160 if (!buf2)
9161 goto onError;
9162 len1 = PyUnicode_GET_LENGTH(str_obj);
9163 len2 = PyUnicode_GET_LENGTH(sub_obj);
9164
9165 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009166 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009167 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009168 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9169 result = asciilib_count(
9170 ((Py_UCS1*)buf1) + start, end - start,
9171 buf2, len2, PY_SSIZE_T_MAX
9172 );
9173 else
9174 result = ucs1lib_count(
9175 ((Py_UCS1*)buf1) + start, end - start,
9176 buf2, len2, PY_SSIZE_T_MAX
9177 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009178 break;
9179 case PyUnicode_2BYTE_KIND:
9180 result = ucs2lib_count(
9181 ((Py_UCS2*)buf1) + start, end - start,
9182 buf2, len2, PY_SSIZE_T_MAX
9183 );
9184 break;
9185 case PyUnicode_4BYTE_KIND:
9186 result = ucs4lib_count(
9187 ((Py_UCS4*)buf1) + start, end - start,
9188 buf2, len2, PY_SSIZE_T_MAX
9189 );
9190 break;
9191 default:
9192 assert(0); result = 0;
9193 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009194
9195 Py_DECREF(sub_obj);
9196 Py_DECREF(str_obj);
9197
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009198 if (kind2 != kind)
9199 PyMem_Free(buf2);
9200
Guido van Rossumd57fd912000-03-10 22:53:23 +00009201 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009202 onError:
9203 Py_DECREF(sub_obj);
9204 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009205 if (kind2 != kind && buf2)
9206 PyMem_Free(buf2);
9207 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009208}
9209
Alexander Belopolsky40018472011-02-26 01:02:56 +00009210Py_ssize_t
9211PyUnicode_Find(PyObject *str,
9212 PyObject *sub,
9213 Py_ssize_t start,
9214 Py_ssize_t end,
9215 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009216{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009217 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009218
Guido van Rossumd57fd912000-03-10 22:53:23 +00009219 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009220 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009221 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009222 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009223 if (!sub) {
9224 Py_DECREF(str);
9225 return -2;
9226 }
9227 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9228 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009229 Py_DECREF(str);
9230 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009231 }
Tim Petersced69f82003-09-16 20:30:58 +00009232
Victor Stinner794d5672011-10-10 03:21:36 +02009233 result = any_find_slice(direction,
9234 str, sub, start, end
9235 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009236
Guido van Rossumd57fd912000-03-10 22:53:23 +00009237 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009238 Py_DECREF(sub);
9239
Guido van Rossumd57fd912000-03-10 22:53:23 +00009240 return result;
9241}
9242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009243Py_ssize_t
9244PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9245 Py_ssize_t start, Py_ssize_t end,
9246 int direction)
9247{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009248 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009249 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009250 if (PyUnicode_READY(str) == -1)
9251 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009252 if (start < 0 || end < 0) {
9253 PyErr_SetString(PyExc_IndexError, "string index out of range");
9254 return -2;
9255 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009256 if (end > PyUnicode_GET_LENGTH(str))
9257 end = PyUnicode_GET_LENGTH(str);
9258 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009259 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9260 kind, end-start, ch, direction);
9261 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009262 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009263 else
9264 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265}
9266
Alexander Belopolsky40018472011-02-26 01:02:56 +00009267static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009268tailmatch(PyObject *self,
9269 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009270 Py_ssize_t start,
9271 Py_ssize_t end,
9272 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009273{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274 int kind_self;
9275 int kind_sub;
9276 void *data_self;
9277 void *data_sub;
9278 Py_ssize_t offset;
9279 Py_ssize_t i;
9280 Py_ssize_t end_sub;
9281
9282 if (PyUnicode_READY(self) == -1 ||
9283 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009284 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285
9286 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009287 return 1;
9288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009289 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9290 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009291 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009292 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009294 kind_self = PyUnicode_KIND(self);
9295 data_self = PyUnicode_DATA(self);
9296 kind_sub = PyUnicode_KIND(substring);
9297 data_sub = PyUnicode_DATA(substring);
9298 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9299
9300 if (direction > 0)
9301 offset = end;
9302 else
9303 offset = start;
9304
9305 if (PyUnicode_READ(kind_self, data_self, offset) ==
9306 PyUnicode_READ(kind_sub, data_sub, 0) &&
9307 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9308 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9309 /* If both are of the same kind, memcmp is sufficient */
9310 if (kind_self == kind_sub) {
9311 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009312 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 data_sub,
9314 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009315 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316 }
9317 /* otherwise we have to compare each character by first accesing it */
9318 else {
9319 /* We do not need to compare 0 and len(substring)-1 because
9320 the if statement above ensured already that they are equal
9321 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009322 for (i = 1; i < end_sub; ++i) {
9323 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9324 PyUnicode_READ(kind_sub, data_sub, i))
9325 return 0;
9326 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009327 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009328 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009329 }
9330
9331 return 0;
9332}
9333
Alexander Belopolsky40018472011-02-26 01:02:56 +00009334Py_ssize_t
9335PyUnicode_Tailmatch(PyObject *str,
9336 PyObject *substr,
9337 Py_ssize_t start,
9338 Py_ssize_t end,
9339 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009340{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009341 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009342
Guido van Rossumd57fd912000-03-10 22:53:23 +00009343 str = PyUnicode_FromObject(str);
9344 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009345 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009346 substr = PyUnicode_FromObject(substr);
9347 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009348 Py_DECREF(str);
9349 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009350 }
Tim Petersced69f82003-09-16 20:30:58 +00009351
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009352 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009353 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009354 Py_DECREF(str);
9355 Py_DECREF(substr);
9356 return result;
9357}
9358
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359/* Apply fixfct filter to the Unicode object self and return a
9360 reference to the modified object */
9361
Alexander Belopolsky40018472011-02-26 01:02:56 +00009362static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009363fixup(PyObject *self,
9364 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009366 PyObject *u;
9367 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009368 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009370 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009371 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009372 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009373 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009374
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009375 /* fix functions return the new maximum character in a string,
9376 if the kind of the resulting unicode object does not change,
9377 everything is fine. Otherwise we need to change the string kind
9378 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009379 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009380
9381 if (maxchar_new == 0) {
9382 /* no changes */;
9383 if (PyUnicode_CheckExact(self)) {
9384 Py_DECREF(u);
9385 Py_INCREF(self);
9386 return self;
9387 }
9388 else
9389 return u;
9390 }
9391
Victor Stinnere6abb482012-05-02 01:15:40 +02009392 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009393
Victor Stinnereaab6042011-12-11 22:22:39 +01009394 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009396
9397 /* In case the maximum character changed, we need to
9398 convert the string to the new category. */
9399 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9400 if (v == NULL) {
9401 Py_DECREF(u);
9402 return NULL;
9403 }
9404 if (maxchar_new > maxchar_old) {
9405 /* If the maxchar increased so that the kind changed, not all
9406 characters are representable anymore and we need to fix the
9407 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009408 _PyUnicode_FastCopyCharacters(v, 0,
9409 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009410 maxchar_old = fixfct(v);
9411 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009412 }
9413 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009414 _PyUnicode_FastCopyCharacters(v, 0,
9415 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009416 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009417 Py_DECREF(u);
9418 assert(_PyUnicode_CheckConsistency(v, 1));
9419 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009420}
9421
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009422static PyObject *
9423ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009424{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009425 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9426 char *resdata, *data = PyUnicode_DATA(self);
9427 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009428
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009429 res = PyUnicode_New(len, 127);
9430 if (res == NULL)
9431 return NULL;
9432 resdata = PyUnicode_DATA(res);
9433 if (lower)
9434 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009435 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009436 _Py_bytes_upper(resdata, data, len);
9437 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009438}
9439
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009440static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009441handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009442{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009443 Py_ssize_t j;
9444 int final_sigma;
9445 Py_UCS4 c;
9446 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009447
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009448 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9449
9450 where ! is a negation and \p{xxx} is a character with property xxx.
9451 */
9452 for (j = i - 1; j >= 0; j--) {
9453 c = PyUnicode_READ(kind, data, j);
9454 if (!_PyUnicode_IsCaseIgnorable(c))
9455 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009456 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009457 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9458 if (final_sigma) {
9459 for (j = i + 1; j < length; j++) {
9460 c = PyUnicode_READ(kind, data, j);
9461 if (!_PyUnicode_IsCaseIgnorable(c))
9462 break;
9463 }
9464 final_sigma = j == length || !_PyUnicode_IsCased(c);
9465 }
9466 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009467}
9468
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009469static int
9470lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9471 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009472{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009473 /* Obscure special case. */
9474 if (c == 0x3A3) {
9475 mapped[0] = handle_capital_sigma(kind, data, length, i);
9476 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009477 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009478 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479}
9480
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009481static Py_ssize_t
9482do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009483{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009484 Py_ssize_t i, k = 0;
9485 int n_res, j;
9486 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009487
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009488 c = PyUnicode_READ(kind, data, 0);
9489 n_res = _PyUnicode_ToUpperFull(c, mapped);
9490 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009491 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009492 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009493 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009494 for (i = 1; i < length; i++) {
9495 c = PyUnicode_READ(kind, data, i);
9496 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9497 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009498 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009499 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009500 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009501 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009502 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503}
9504
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009505static Py_ssize_t
9506do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9507 Py_ssize_t i, k = 0;
9508
9509 for (i = 0; i < length; i++) {
9510 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9511 int n_res, j;
9512 if (Py_UNICODE_ISUPPER(c)) {
9513 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9514 }
9515 else if (Py_UNICODE_ISLOWER(c)) {
9516 n_res = _PyUnicode_ToUpperFull(c, mapped);
9517 }
9518 else {
9519 n_res = 1;
9520 mapped[0] = c;
9521 }
9522 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009523 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009524 res[k++] = mapped[j];
9525 }
9526 }
9527 return k;
9528}
9529
9530static Py_ssize_t
9531do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9532 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009533{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009534 Py_ssize_t i, k = 0;
9535
9536 for (i = 0; i < length; i++) {
9537 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9538 int n_res, j;
9539 if (lower)
9540 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9541 else
9542 n_res = _PyUnicode_ToUpperFull(c, mapped);
9543 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009544 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009545 res[k++] = mapped[j];
9546 }
9547 }
9548 return k;
9549}
9550
9551static Py_ssize_t
9552do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9553{
9554 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9555}
9556
9557static Py_ssize_t
9558do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9559{
9560 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9561}
9562
Benjamin Petersone51757f2012-01-12 21:10:29 -05009563static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009564do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9565{
9566 Py_ssize_t i, k = 0;
9567
9568 for (i = 0; i < length; i++) {
9569 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9570 Py_UCS4 mapped[3];
9571 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9572 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009573 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009574 res[k++] = mapped[j];
9575 }
9576 }
9577 return k;
9578}
9579
9580static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009581do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9582{
9583 Py_ssize_t i, k = 0;
9584 int previous_is_cased;
9585
9586 previous_is_cased = 0;
9587 for (i = 0; i < length; i++) {
9588 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9589 Py_UCS4 mapped[3];
9590 int n_res, j;
9591
9592 if (previous_is_cased)
9593 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9594 else
9595 n_res = _PyUnicode_ToTitleFull(c, mapped);
9596
9597 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009598 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009599 res[k++] = mapped[j];
9600 }
9601
9602 previous_is_cased = _PyUnicode_IsCased(c);
9603 }
9604 return k;
9605}
9606
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009607static PyObject *
9608case_operation(PyObject *self,
9609 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9610{
9611 PyObject *res = NULL;
9612 Py_ssize_t length, newlength = 0;
9613 int kind, outkind;
9614 void *data, *outdata;
9615 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9616
Benjamin Petersoneea48462012-01-16 14:28:50 -05009617 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009618
9619 kind = PyUnicode_KIND(self);
9620 data = PyUnicode_DATA(self);
9621 length = PyUnicode_GET_LENGTH(self);
9622 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9623 if (tmp == NULL)
9624 return PyErr_NoMemory();
9625 newlength = perform(kind, data, length, tmp, &maxchar);
9626 res = PyUnicode_New(newlength, maxchar);
9627 if (res == NULL)
9628 goto leave;
9629 tmpend = tmp + newlength;
9630 outdata = PyUnicode_DATA(res);
9631 outkind = PyUnicode_KIND(res);
9632 switch (outkind) {
9633 case PyUnicode_1BYTE_KIND:
9634 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9635 break;
9636 case PyUnicode_2BYTE_KIND:
9637 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9638 break;
9639 case PyUnicode_4BYTE_KIND:
9640 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9641 break;
9642 default:
9643 assert(0);
9644 break;
9645 }
9646 leave:
9647 PyMem_FREE(tmp);
9648 return res;
9649}
9650
Tim Peters8ce9f162004-08-27 01:49:32 +00009651PyObject *
9652PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009654 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009655 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009656 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009657 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009658 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9659 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009660 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009661 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009662 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009663 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009664 int use_memcpy;
9665 unsigned char *res_data = NULL, *sep_data = NULL;
9666 PyObject *last_obj;
9667 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009668
Tim Peters05eba1f2004-08-27 21:32:02 +00009669 fseq = PySequence_Fast(seq, "");
9670 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009671 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009672 }
9673
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009674 /* NOTE: the following code can't call back into Python code,
9675 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009676 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009677
Tim Peters05eba1f2004-08-27 21:32:02 +00009678 seqlen = PySequence_Fast_GET_SIZE(fseq);
9679 /* If empty sequence, return u"". */
9680 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009681 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009682 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009683 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009684
Tim Peters05eba1f2004-08-27 21:32:02 +00009685 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009686 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009687 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009688 if (seqlen == 1) {
9689 if (PyUnicode_CheckExact(items[0])) {
9690 res = items[0];
9691 Py_INCREF(res);
9692 Py_DECREF(fseq);
9693 return res;
9694 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009695 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009696 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009697 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009698 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009699 /* Set up sep and seplen */
9700 if (separator == NULL) {
9701 /* fall back to a blank space separator */
9702 sep = PyUnicode_FromOrdinal(' ');
9703 if (!sep)
9704 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009705 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009706 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009707 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009708 else {
9709 if (!PyUnicode_Check(separator)) {
9710 PyErr_Format(PyExc_TypeError,
9711 "separator: expected str instance,"
9712 " %.80s found",
9713 Py_TYPE(separator)->tp_name);
9714 goto onError;
9715 }
9716 if (PyUnicode_READY(separator))
9717 goto onError;
9718 sep = separator;
9719 seplen = PyUnicode_GET_LENGTH(separator);
9720 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9721 /* inc refcount to keep this code path symmetric with the
9722 above case of a blank separator */
9723 Py_INCREF(sep);
9724 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009725 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009726 }
9727
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009728 /* There are at least two things to join, or else we have a subclass
9729 * of str in the sequence.
9730 * Do a pre-pass to figure out the total amount of space we'll
9731 * need (sz), and see whether all argument are strings.
9732 */
9733 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009734#ifdef Py_DEBUG
9735 use_memcpy = 0;
9736#else
9737 use_memcpy = 1;
9738#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009739 for (i = 0; i < seqlen; i++) {
9740 const Py_ssize_t old_sz = sz;
9741 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009742 if (!PyUnicode_Check(item)) {
9743 PyErr_Format(PyExc_TypeError,
9744 "sequence item %zd: expected str instance,"
9745 " %.80s found",
9746 i, Py_TYPE(item)->tp_name);
9747 goto onError;
9748 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009749 if (PyUnicode_READY(item) == -1)
9750 goto onError;
9751 sz += PyUnicode_GET_LENGTH(item);
9752 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009753 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009754 if (i != 0)
9755 sz += seplen;
9756 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9757 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009758 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009759 goto onError;
9760 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009761 if (use_memcpy && last_obj != NULL) {
9762 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9763 use_memcpy = 0;
9764 }
9765 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009766 }
Tim Petersced69f82003-09-16 20:30:58 +00009767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009769 if (res == NULL)
9770 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009771
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009772 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009773#ifdef Py_DEBUG
9774 use_memcpy = 0;
9775#else
9776 if (use_memcpy) {
9777 res_data = PyUnicode_1BYTE_DATA(res);
9778 kind = PyUnicode_KIND(res);
9779 if (seplen != 0)
9780 sep_data = PyUnicode_1BYTE_DATA(sep);
9781 }
9782#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009783 if (use_memcpy) {
9784 for (i = 0; i < seqlen; ++i) {
9785 Py_ssize_t itemlen;
9786 item = items[i];
9787
9788 /* Copy item, and maybe the separator. */
9789 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009790 Py_MEMCPY(res_data,
9791 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009792 kind * seplen);
9793 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009794 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009795
9796 itemlen = PyUnicode_GET_LENGTH(item);
9797 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009798 Py_MEMCPY(res_data,
9799 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009800 kind * itemlen);
9801 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009802 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009803 }
9804 assert(res_data == PyUnicode_1BYTE_DATA(res)
9805 + kind * PyUnicode_GET_LENGTH(res));
9806 }
9807 else {
9808 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9809 Py_ssize_t itemlen;
9810 item = items[i];
9811
9812 /* Copy item, and maybe the separator. */
9813 if (i && seplen != 0) {
9814 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9815 res_offset += seplen;
9816 }
9817
9818 itemlen = PyUnicode_GET_LENGTH(item);
9819 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009820 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009821 res_offset += itemlen;
9822 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009823 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009824 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009825 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009826
Tim Peters05eba1f2004-08-27 21:32:02 +00009827 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009829 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009831
Benjamin Peterson29060642009-01-31 22:14:21 +00009832 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009833 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009834 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009835 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009836 return NULL;
9837}
9838
/* Store `value` into `length` consecutive code units of `data`, beginning
   at unit index `start`.  `kind` selects the unit width (1, 2 or 4 byte
   kind) and must not be PyUnicode_WCHAR_KIND.  The 1-byte case uses
   memset(); the wider cases use a plain loop.  Arguments are evaluated
   more than once, so pass expressions without side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9862
Victor Stinnerd3f08822012-05-29 12:57:52 +02009863void
9864_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9865 Py_UCS4 fill_char)
9866{
9867 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9868 const void *data = PyUnicode_DATA(unicode);
9869 assert(PyUnicode_IS_READY(unicode));
9870 assert(unicode_modifiable(unicode));
9871 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9872 assert(start >= 0);
9873 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9874 FILL(kind, data, fill_char, start, length);
9875}
9876
Victor Stinner3fe55312012-01-04 00:33:50 +01009877Py_ssize_t
9878PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9879 Py_UCS4 fill_char)
9880{
9881 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009882
9883 if (!PyUnicode_Check(unicode)) {
9884 PyErr_BadInternalCall();
9885 return -1;
9886 }
9887 if (PyUnicode_READY(unicode) == -1)
9888 return -1;
9889 if (unicode_check_modifiable(unicode))
9890 return -1;
9891
Victor Stinnerd3f08822012-05-29 12:57:52 +02009892 if (start < 0) {
9893 PyErr_SetString(PyExc_IndexError, "string index out of range");
9894 return -1;
9895 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009896 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9897 PyErr_SetString(PyExc_ValueError,
9898 "fill character is bigger than "
9899 "the string maximum character");
9900 return -1;
9901 }
9902
9903 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9904 length = Py_MIN(maxlen, length);
9905 if (length <= 0)
9906 return 0;
9907
Victor Stinnerd3f08822012-05-29 12:57:52 +02009908 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009909 return length;
9910}
9911
Victor Stinner9310abb2011-10-05 00:59:23 +02009912static PyObject *
9913pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009914 Py_ssize_t left,
9915 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009916 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009917{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009918 PyObject *u;
9919 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009920 int kind;
9921 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009922
9923 if (left < 0)
9924 left = 0;
9925 if (right < 0)
9926 right = 0;
9927
Victor Stinnerc4b49542011-12-11 22:44:26 +01009928 if (left == 0 && right == 0)
9929 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009931 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9932 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009933 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9934 return NULL;
9935 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009936 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009937 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009938 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009939 if (!u)
9940 return NULL;
9941
9942 kind = PyUnicode_KIND(u);
9943 data = PyUnicode_DATA(u);
9944 if (left)
9945 FILL(kind, data, fill, 0, left);
9946 if (right)
9947 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009948 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009949 assert(_PyUnicode_CheckConsistency(u, 1));
9950 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009951}
9952
Alexander Belopolsky40018472011-02-26 01:02:56 +00009953PyObject *
9954PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009955{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009956 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009957
9958 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009959 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009960 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009961 if (PyUnicode_READY(string) == -1) {
9962 Py_DECREF(string);
9963 return NULL;
9964 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009965
Benjamin Petersonead6b532011-12-20 17:23:42 -06009966 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009967 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009968 if (PyUnicode_IS_ASCII(string))
9969 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009970 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009971 PyUnicode_GET_LENGTH(string), keepends);
9972 else
9973 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009974 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009975 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976 break;
9977 case PyUnicode_2BYTE_KIND:
9978 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009979 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 PyUnicode_GET_LENGTH(string), keepends);
9981 break;
9982 case PyUnicode_4BYTE_KIND:
9983 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009984 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009985 PyUnicode_GET_LENGTH(string), keepends);
9986 break;
9987 default:
9988 assert(0);
9989 list = 0;
9990 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009991 Py_DECREF(string);
9992 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009993}
9994
Alexander Belopolsky40018472011-02-26 01:02:56 +00009995static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009996split(PyObject *self,
9997 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009998 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009999{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000 int kind1, kind2, kind;
10001 void *buf1, *buf2;
10002 Py_ssize_t len1, len2;
10003 PyObject* out;
10004
Guido van Rossumd57fd912000-03-10 22:53:23 +000010005 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010006 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010008 if (PyUnicode_READY(self) == -1)
10009 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010011 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010012 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010014 if (PyUnicode_IS_ASCII(self))
10015 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010016 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010017 PyUnicode_GET_LENGTH(self), maxcount
10018 );
10019 else
10020 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010021 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010022 PyUnicode_GET_LENGTH(self), maxcount
10023 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 case PyUnicode_2BYTE_KIND:
10025 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010026 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 PyUnicode_GET_LENGTH(self), maxcount
10028 );
10029 case PyUnicode_4BYTE_KIND:
10030 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010031 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010032 PyUnicode_GET_LENGTH(self), maxcount
10033 );
10034 default:
10035 assert(0);
10036 return NULL;
10037 }
10038
10039 if (PyUnicode_READY(substring) == -1)
10040 return NULL;
10041
10042 kind1 = PyUnicode_KIND(self);
10043 kind2 = PyUnicode_KIND(substring);
10044 kind = kind1 > kind2 ? kind1 : kind2;
10045 buf1 = PyUnicode_DATA(self);
10046 buf2 = PyUnicode_DATA(substring);
10047 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010048 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049 if (!buf1)
10050 return NULL;
10051 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010052 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 if (!buf2) {
10054 if (kind1 != kind) PyMem_Free(buf1);
10055 return NULL;
10056 }
10057 len1 = PyUnicode_GET_LENGTH(self);
10058 len2 = PyUnicode_GET_LENGTH(substring);
10059
Benjamin Petersonead6b532011-12-20 17:23:42 -060010060 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010062 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10063 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010064 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010065 else
10066 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010067 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010068 break;
10069 case PyUnicode_2BYTE_KIND:
10070 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010071 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 break;
10073 case PyUnicode_4BYTE_KIND:
10074 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010075 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010076 break;
10077 default:
10078 out = NULL;
10079 }
10080 if (kind1 != kind)
10081 PyMem_Free(buf1);
10082 if (kind2 != kind)
10083 PyMem_Free(buf2);
10084 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010085}
10086
Alexander Belopolsky40018472011-02-26 01:02:56 +000010087static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010088rsplit(PyObject *self,
10089 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010090 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010091{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010092 int kind1, kind2, kind;
10093 void *buf1, *buf2;
10094 Py_ssize_t len1, len2;
10095 PyObject* out;
10096
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010097 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010098 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010100 if (PyUnicode_READY(self) == -1)
10101 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010104 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010105 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010106 if (PyUnicode_IS_ASCII(self))
10107 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010108 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010109 PyUnicode_GET_LENGTH(self), maxcount
10110 );
10111 else
10112 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010113 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010114 PyUnicode_GET_LENGTH(self), maxcount
10115 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 case PyUnicode_2BYTE_KIND:
10117 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010118 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 PyUnicode_GET_LENGTH(self), maxcount
10120 );
10121 case PyUnicode_4BYTE_KIND:
10122 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010123 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010124 PyUnicode_GET_LENGTH(self), maxcount
10125 );
10126 default:
10127 assert(0);
10128 return NULL;
10129 }
10130
10131 if (PyUnicode_READY(substring) == -1)
10132 return NULL;
10133
10134 kind1 = PyUnicode_KIND(self);
10135 kind2 = PyUnicode_KIND(substring);
10136 kind = kind1 > kind2 ? kind1 : kind2;
10137 buf1 = PyUnicode_DATA(self);
10138 buf2 = PyUnicode_DATA(substring);
10139 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010140 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 if (!buf1)
10142 return NULL;
10143 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010144 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 if (!buf2) {
10146 if (kind1 != kind) PyMem_Free(buf1);
10147 return NULL;
10148 }
10149 len1 = PyUnicode_GET_LENGTH(self);
10150 len2 = PyUnicode_GET_LENGTH(substring);
10151
Benjamin Petersonead6b532011-12-20 17:23:42 -060010152 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010154 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10155 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010156 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010157 else
10158 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010159 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 break;
10161 case PyUnicode_2BYTE_KIND:
10162 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010163 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 break;
10165 case PyUnicode_4BYTE_KIND:
10166 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010167 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 break;
10169 default:
10170 out = NULL;
10171 }
10172 if (kind1 != kind)
10173 PyMem_Free(buf1);
10174 if (kind2 != kind)
10175 PyMem_Free(buf2);
10176 return out;
10177}
10178
10179static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010180anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10181 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010183 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010185 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10186 return asciilib_find(buf1, len1, buf2, len2, offset);
10187 else
10188 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 case PyUnicode_2BYTE_KIND:
10190 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10191 case PyUnicode_4BYTE_KIND:
10192 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10193 }
10194 assert(0);
10195 return -1;
10196}
10197
10198static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010199anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10200 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010202 switch (kind) {
10203 case PyUnicode_1BYTE_KIND:
10204 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10205 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10206 else
10207 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10208 case PyUnicode_2BYTE_KIND:
10209 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10210 case PyUnicode_4BYTE_KIND:
10211 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10212 }
10213 assert(0);
10214 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010215}
10216
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010217static void
10218replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10219 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10220{
10221 int kind = PyUnicode_KIND(u);
10222 void *data = PyUnicode_DATA(u);
10223 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10224 if (kind == PyUnicode_1BYTE_KIND) {
10225 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10226 (Py_UCS1 *)data + len,
10227 u1, u2, maxcount);
10228 }
10229 else if (kind == PyUnicode_2BYTE_KIND) {
10230 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10231 (Py_UCS2 *)data + len,
10232 u1, u2, maxcount);
10233 }
10234 else {
10235 assert(kind == PyUnicode_4BYTE_KIND);
10236 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10237 (Py_UCS4 *)data + len,
10238 u1, u2, maxcount);
10239 }
10240}
10241
Alexander Belopolsky40018472011-02-26 01:02:56 +000010242static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243replace(PyObject *self, PyObject *str1,
10244 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 PyObject *u;
10247 char *sbuf = PyUnicode_DATA(self);
10248 char *buf1 = PyUnicode_DATA(str1);
10249 char *buf2 = PyUnicode_DATA(str2);
10250 int srelease = 0, release1 = 0, release2 = 0;
10251 int skind = PyUnicode_KIND(self);
10252 int kind1 = PyUnicode_KIND(str1);
10253 int kind2 = PyUnicode_KIND(str2);
10254 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10255 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10256 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010257 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010258 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010259
10260 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010261 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010262 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010263 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010264
Victor Stinner59de0ee2011-10-07 10:01:28 +020010265 if (str1 == str2)
10266 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267
Victor Stinner49a0a212011-10-12 23:46:10 +020010268 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010269 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10270 if (maxchar < maxchar_str1)
10271 /* substring too wide to be present */
10272 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010273 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10274 /* Replacing str1 with str2 may cause a maxchar reduction in the
10275 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010276 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010277 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010278
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010280 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010282 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010284 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010285 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010286 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010287
Victor Stinner69ed0f42013-04-09 21:48:24 +020010288 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010289 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010290 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010291 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010292 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010294 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010296
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010297 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10298 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010299 }
10300 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 int rkind = skind;
10302 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010303 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 if (kind1 < rkind) {
10306 /* widen substring */
10307 buf1 = _PyUnicode_AsKind(str1, rkind);
10308 if (!buf1) goto error;
10309 release1 = 1;
10310 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010311 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010312 if (i < 0)
10313 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 if (rkind > kind2) {
10315 /* widen replacement */
10316 buf2 = _PyUnicode_AsKind(str2, rkind);
10317 if (!buf2) goto error;
10318 release2 = 1;
10319 }
10320 else if (rkind < kind2) {
10321 /* widen self and buf1 */
10322 rkind = kind2;
10323 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010324 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 sbuf = _PyUnicode_AsKind(self, rkind);
10326 if (!sbuf) goto error;
10327 srelease = 1;
10328 buf1 = _PyUnicode_AsKind(str1, rkind);
10329 if (!buf1) goto error;
10330 release1 = 1;
10331 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010332 u = PyUnicode_New(slen, maxchar);
10333 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010335 assert(PyUnicode_KIND(u) == rkind);
10336 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010337
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010338 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010339 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010340 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010342 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010344
10345 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010346 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010347 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010348 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010349 if (i == -1)
10350 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010351 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010352 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010353 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010355 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010356 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010357 }
10358 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010360 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 int rkind = skind;
10362 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010363
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010365 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 buf1 = _PyUnicode_AsKind(str1, rkind);
10367 if (!buf1) goto error;
10368 release1 = 1;
10369 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010370 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010371 if (n == 0)
10372 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010374 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 buf2 = _PyUnicode_AsKind(str2, rkind);
10376 if (!buf2) goto error;
10377 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010378 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010380 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 rkind = kind2;
10382 sbuf = _PyUnicode_AsKind(self, rkind);
10383 if (!sbuf) goto error;
10384 srelease = 1;
10385 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010386 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 buf1 = _PyUnicode_AsKind(str1, rkind);
10388 if (!buf1) goto error;
10389 release1 = 1;
10390 }
10391 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10392 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010393 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 PyErr_SetString(PyExc_OverflowError,
10395 "replace string is too long");
10396 goto error;
10397 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010398 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010399 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010400 _Py_INCREF_UNICODE_EMPTY();
10401 if (!unicode_empty)
10402 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010403 u = unicode_empty;
10404 goto done;
10405 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010406 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 PyErr_SetString(PyExc_OverflowError,
10408 "replace string is too long");
10409 goto error;
10410 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010411 u = PyUnicode_New(new_size, maxchar);
10412 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010414 assert(PyUnicode_KIND(u) == rkind);
10415 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 ires = i = 0;
10417 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010418 while (n-- > 0) {
10419 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010420 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010421 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010422 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010423 if (j == -1)
10424 break;
10425 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010426 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010427 memcpy(res + rkind * ires,
10428 sbuf + rkind * i,
10429 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010431 }
10432 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010434 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010436 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010438 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010440 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010442 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010443 memcpy(res + rkind * ires,
10444 sbuf + rkind * i,
10445 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010446 }
10447 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010448 /* interleave */
10449 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010450 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010452 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010454 if (--n <= 0)
10455 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010456 memcpy(res + rkind * ires,
10457 sbuf + rkind * i,
10458 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459 ires++;
10460 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010461 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010462 memcpy(res + rkind * ires,
10463 sbuf + rkind * i,
10464 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010465 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010466 }
10467
10468 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010469 unicode_adjust_maxchar(&u);
10470 if (u == NULL)
10471 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010472 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010473
10474 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 if (srelease)
10476 PyMem_FREE(sbuf);
10477 if (release1)
10478 PyMem_FREE(buf1);
10479 if (release2)
10480 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010481 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010483
Benjamin Peterson29060642009-01-31 22:14:21 +000010484 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010485 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 if (srelease)
10487 PyMem_FREE(sbuf);
10488 if (release1)
10489 PyMem_FREE(buf1);
10490 if (release2)
10491 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010492 return unicode_result_unchanged(self);
10493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 error:
10495 if (srelease && sbuf)
10496 PyMem_FREE(sbuf);
10497 if (release1 && buf1)
10498 PyMem_FREE(buf1);
10499 if (release2 && buf2)
10500 PyMem_FREE(buf2);
10501 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010502}
10503
10504/* --- Unicode Object Methods --------------------------------------------- */
10505
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010506PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010507 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010508\n\
10509Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010510characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010511
10512static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010513unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010514{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010515 if (PyUnicode_READY(self) == -1)
10516 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010517 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010518}
10519
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010520PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010521 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522\n\
10523Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010524have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525
10526static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010527unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010528{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010529 if (PyUnicode_READY(self) == -1)
10530 return NULL;
10531 if (PyUnicode_GET_LENGTH(self) == 0)
10532 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010533 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010534}
10535
Benjamin Petersond5890c82012-01-14 13:23:30 -050010536PyDoc_STRVAR(casefold__doc__,
10537 "S.casefold() -> str\n\
10538\n\
10539Return a version of S suitable for caseless comparisons.");
10540
10541static PyObject *
10542unicode_casefold(PyObject *self)
10543{
10544 if (PyUnicode_READY(self) == -1)
10545 return NULL;
10546 if (PyUnicode_IS_ASCII(self))
10547 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010548 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010549}
10550
10551
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010552/* Argument converter. Coerces to a single unicode character */
10553
10554static int
10555convert_uc(PyObject *obj, void *addr)
10556{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010558 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010559
Benjamin Peterson14339b62009-01-31 16:36:08 +000010560 uniobj = PyUnicode_FromObject(obj);
10561 if (uniobj == NULL) {
10562 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010563 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010564 return 0;
10565 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010567 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010568 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010569 Py_DECREF(uniobj);
10570 return 0;
10571 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010573 Py_DECREF(uniobj);
10574 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010575}
10576
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010577PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010578 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010579\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010580Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010581done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010582
10583static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010584unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010585{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010586 Py_ssize_t marg, left;
10587 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 Py_UCS4 fillchar = ' ';
10589
Victor Stinnere9a29352011-10-01 02:14:59 +020010590 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010592
Benjamin Petersonbac79492012-01-14 13:34:47 -050010593 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010594 return NULL;
10595
Victor Stinnerc4b49542011-12-11 22:44:26 +010010596 if (PyUnicode_GET_LENGTH(self) >= width)
10597 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598
Victor Stinnerc4b49542011-12-11 22:44:26 +010010599 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600 left = marg / 2 + (marg & width & 1);
10601
Victor Stinner9310abb2011-10-05 00:59:23 +020010602 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603}
10604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605/* This function assumes that str1 and str2 are readied by the caller. */
10606
Marc-André Lemburge5034372000-08-08 08:04:29 +000010607static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010608unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010609{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010610#define COMPARE(TYPE1, TYPE2) \
10611 do { \
10612 TYPE1* p1 = (TYPE1 *)data1; \
10613 TYPE2* p2 = (TYPE2 *)data2; \
10614 TYPE1* end = p1 + len; \
10615 Py_UCS4 c1, c2; \
10616 for (; p1 != end; p1++, p2++) { \
10617 c1 = *p1; \
10618 c2 = *p2; \
10619 if (c1 != c2) \
10620 return (c1 < c2) ? -1 : 1; \
10621 } \
10622 } \
10623 while (0)
10624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 int kind1, kind2;
10626 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010627 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 kind1 = PyUnicode_KIND(str1);
10630 kind2 = PyUnicode_KIND(str2);
10631 data1 = PyUnicode_DATA(str1);
10632 data2 = PyUnicode_DATA(str2);
10633 len1 = PyUnicode_GET_LENGTH(str1);
10634 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010635 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010636
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010637 switch(kind1) {
10638 case PyUnicode_1BYTE_KIND:
10639 {
10640 switch(kind2) {
10641 case PyUnicode_1BYTE_KIND:
10642 {
10643 int cmp = memcmp(data1, data2, len);
10644 /* normalize result of memcmp() into the range [-1; 1] */
10645 if (cmp < 0)
10646 return -1;
10647 if (cmp > 0)
10648 return 1;
10649 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010650 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010651 case PyUnicode_2BYTE_KIND:
10652 COMPARE(Py_UCS1, Py_UCS2);
10653 break;
10654 case PyUnicode_4BYTE_KIND:
10655 COMPARE(Py_UCS1, Py_UCS4);
10656 break;
10657 default:
10658 assert(0);
10659 }
10660 break;
10661 }
10662 case PyUnicode_2BYTE_KIND:
10663 {
10664 switch(kind2) {
10665 case PyUnicode_1BYTE_KIND:
10666 COMPARE(Py_UCS2, Py_UCS1);
10667 break;
10668 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010669 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010670 COMPARE(Py_UCS2, Py_UCS2);
10671 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010672 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010673 case PyUnicode_4BYTE_KIND:
10674 COMPARE(Py_UCS2, Py_UCS4);
10675 break;
10676 default:
10677 assert(0);
10678 }
10679 break;
10680 }
10681 case PyUnicode_4BYTE_KIND:
10682 {
10683 switch(kind2) {
10684 case PyUnicode_1BYTE_KIND:
10685 COMPARE(Py_UCS4, Py_UCS1);
10686 break;
10687 case PyUnicode_2BYTE_KIND:
10688 COMPARE(Py_UCS4, Py_UCS2);
10689 break;
10690 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010691 {
10692#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10693 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10694 /* normalize result of wmemcmp() into the range [-1; 1] */
10695 if (cmp < 0)
10696 return -1;
10697 if (cmp > 0)
10698 return 1;
10699#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010700 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010701#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010702 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010703 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010704 default:
10705 assert(0);
10706 }
10707 break;
10708 }
10709 default:
10710 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010711 }
10712
Victor Stinner770e19e2012-10-04 22:59:45 +020010713 if (len1 == len2)
10714 return 0;
10715 if (len1 < len2)
10716 return -1;
10717 else
10718 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010719
10720#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010721}
10722
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010723Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010724unicode_compare_eq(PyObject *str1, PyObject *str2)
10725{
10726 int kind;
10727 void *data1, *data2;
10728 Py_ssize_t len;
10729 int cmp;
10730
Victor Stinnere5567ad2012-10-23 02:48:49 +020010731 len = PyUnicode_GET_LENGTH(str1);
10732 if (PyUnicode_GET_LENGTH(str2) != len)
10733 return 0;
10734 kind = PyUnicode_KIND(str1);
10735 if (PyUnicode_KIND(str2) != kind)
10736 return 0;
10737 data1 = PyUnicode_DATA(str1);
10738 data2 = PyUnicode_DATA(str2);
10739
10740 cmp = memcmp(data1, data2, len * kind);
10741 return (cmp == 0);
10742}
10743
10744
Alexander Belopolsky40018472011-02-26 01:02:56 +000010745int
10746PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010747{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010748 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10749 if (PyUnicode_READY(left) == -1 ||
10750 PyUnicode_READY(right) == -1)
10751 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010752
10753 /* a string is equal to itself */
10754 if (left == right)
10755 return 0;
10756
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010757 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010758 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010759 PyErr_Format(PyExc_TypeError,
10760 "Can't compare %.100s and %.100s",
10761 left->ob_type->tp_name,
10762 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763 return -1;
10764}
10765
Martin v. Löwis5b222132007-06-10 09:51:05 +000010766int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010767_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10768{
10769 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10770 if (right_str == NULL)
10771 return -1;
10772 return PyUnicode_Compare(left, right_str);
10773}
10774
10775int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010776PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10777{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010778 Py_ssize_t i;
10779 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010780 Py_UCS4 chr;
10781
Victor Stinner910337b2011-10-03 03:20:16 +020010782 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 if (PyUnicode_READY(uni) == -1)
10784 return -1;
10785 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010786 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010787 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010788 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010789 size_t len, len2 = strlen(str);
10790 int cmp;
10791
10792 len = Py_MIN(len1, len2);
10793 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010794 if (cmp != 0) {
10795 if (cmp < 0)
10796 return -1;
10797 else
10798 return 1;
10799 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010800 if (len1 > len2)
10801 return 1; /* uni is longer */
10802 if (len2 > len1)
10803 return -1; /* str is longer */
10804 return 0;
10805 }
10806 else {
10807 void *data = PyUnicode_DATA(uni);
10808 /* Compare Unicode string and source character set string */
10809 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10810 if (chr != str[i])
10811 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10812 /* This check keeps Python strings that end in '\0' from comparing equal
10813 to C strings identical up to that point. */
10814 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10815 return 1; /* uni is longer */
10816 if (str[i])
10817 return -1; /* str is longer */
10818 return 0;
10819 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010820}
10821
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010822
Benjamin Peterson29060642009-01-31 22:14:21 +000010823#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010824 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010825
Alexander Belopolsky40018472011-02-26 01:02:56 +000010826PyObject *
10827PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010828{
10829 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010830 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010831
Victor Stinnere5567ad2012-10-23 02:48:49 +020010832 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10833 Py_RETURN_NOTIMPLEMENTED;
10834
10835 if (PyUnicode_READY(left) == -1 ||
10836 PyUnicode_READY(right) == -1)
10837 return NULL;
10838
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010839 if (left == right) {
10840 switch (op) {
10841 case Py_EQ:
10842 case Py_LE:
10843 case Py_GE:
10844 /* a string is equal to itself */
10845 v = Py_True;
10846 break;
10847 case Py_NE:
10848 case Py_LT:
10849 case Py_GT:
10850 v = Py_False;
10851 break;
10852 default:
10853 PyErr_BadArgument();
10854 return NULL;
10855 }
10856 }
10857 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010858 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010859 result ^= (op == Py_NE);
10860 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010861 }
10862 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010863 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010864
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010865 /* Convert the return value to a Boolean */
10866 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010867 case Py_LE:
10868 v = TEST_COND(result <= 0);
10869 break;
10870 case Py_GE:
10871 v = TEST_COND(result >= 0);
10872 break;
10873 case Py_LT:
10874 v = TEST_COND(result == -1);
10875 break;
10876 case Py_GT:
10877 v = TEST_COND(result == 1);
10878 break;
10879 default:
10880 PyErr_BadArgument();
10881 return NULL;
10882 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010883 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010884 Py_INCREF(v);
10885 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010886}
10887
Alexander Belopolsky40018472011-02-26 01:02:56 +000010888int
10889PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010890{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010891 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010892 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010893 void *buf1, *buf2;
10894 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010895 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010896
10897 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010898 sub = PyUnicode_FromObject(element);
10899 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010900 PyErr_Format(PyExc_TypeError,
10901 "'in <string>' requires string as left operand, not %s",
10902 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010903 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010904 }
10905
Thomas Wouters477c8d52006-05-27 19:21:47 +000010906 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010907 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010908 Py_DECREF(sub);
10909 return -1;
10910 }
10911
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010912 kind1 = PyUnicode_KIND(str);
10913 kind2 = PyUnicode_KIND(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010914 buf1 = PyUnicode_DATA(str);
10915 buf2 = PyUnicode_DATA(sub);
Victor Stinner77282cb2013-04-14 19:22:47 +020010916 if (kind2 != kind1) {
10917 if (kind2 > kind1) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010918 Py_DECREF(sub);
10919 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010920 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010921 }
Victor Stinner77282cb2013-04-14 19:22:47 +020010922 buf2 = _PyUnicode_AsKind(sub, kind1);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010923 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010924 if (!buf2) {
10925 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010926 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010927 return -1;
10928 }
10929 len1 = PyUnicode_GET_LENGTH(str);
10930 len2 = PyUnicode_GET_LENGTH(sub);
10931
Victor Stinner77282cb2013-04-14 19:22:47 +020010932 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933 case PyUnicode_1BYTE_KIND:
10934 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10935 break;
10936 case PyUnicode_2BYTE_KIND:
10937 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10938 break;
10939 case PyUnicode_4BYTE_KIND:
10940 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10941 break;
10942 default:
10943 result = -1;
10944 assert(0);
10945 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010946
10947 Py_DECREF(str);
10948 Py_DECREF(sub);
10949
Victor Stinner77282cb2013-04-14 19:22:47 +020010950 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010951 PyMem_Free(buf2);
10952
Guido van Rossum403d68b2000-03-13 15:55:09 +000010953 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010954}
10955
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956/* Concat to string or Unicode object giving a new Unicode object. */
10957
Alexander Belopolsky40018472011-02-26 01:02:56 +000010958PyObject *
10959PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010961 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010962 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010963 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010964
10965 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010968 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010969 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010971 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972
10973 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010974 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010975 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010978 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010979 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010980 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010981 }
10982
Victor Stinner488fa492011-12-12 00:01:39 +010010983 u_len = PyUnicode_GET_LENGTH(u);
10984 v_len = PyUnicode_GET_LENGTH(v);
10985 if (u_len > PY_SSIZE_T_MAX - v_len) {
10986 PyErr_SetString(PyExc_OverflowError,
10987 "strings are too large to concat");
10988 goto onError;
10989 }
10990 new_len = u_len + v_len;
10991
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010993 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010994 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010995
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010997 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010999 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011000 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11001 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002 Py_DECREF(u);
11003 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011004 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006
Benjamin Peterson29060642009-01-31 22:14:21 +000011007 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008 Py_XDECREF(u);
11009 Py_XDECREF(v);
11010 return NULL;
11011}
11012
Walter Dörwald1ab83302007-05-18 17:15:44 +000011013void
Victor Stinner23e56682011-10-03 03:54:37 +020011014PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011015{
Victor Stinner23e56682011-10-03 03:54:37 +020011016 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011017 Py_UCS4 maxchar, maxchar2;
11018 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011019
11020 if (p_left == NULL) {
11021 if (!PyErr_Occurred())
11022 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011023 return;
11024 }
Victor Stinner23e56682011-10-03 03:54:37 +020011025 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011026 if (right == NULL || left == NULL
11027 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011028 if (!PyErr_Occurred())
11029 PyErr_BadInternalCall();
11030 goto error;
11031 }
11032
Benjamin Petersonbac79492012-01-14 13:34:47 -050011033 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011034 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011035 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011036 goto error;
11037
Victor Stinner488fa492011-12-12 00:01:39 +010011038 /* Shortcuts */
11039 if (left == unicode_empty) {
11040 Py_DECREF(left);
11041 Py_INCREF(right);
11042 *p_left = right;
11043 return;
11044 }
11045 if (right == unicode_empty)
11046 return;
11047
11048 left_len = PyUnicode_GET_LENGTH(left);
11049 right_len = PyUnicode_GET_LENGTH(right);
11050 if (left_len > PY_SSIZE_T_MAX - right_len) {
11051 PyErr_SetString(PyExc_OverflowError,
11052 "strings are too large to concat");
11053 goto error;
11054 }
11055 new_len = left_len + right_len;
11056
11057 if (unicode_modifiable(left)
11058 && PyUnicode_CheckExact(right)
11059 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011060 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11061 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011062 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011063 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011064 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11065 {
11066 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011067 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011068 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011069
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011070 /* copy 'right' into the newly allocated area of 'left' */
11071 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011072 }
Victor Stinner488fa492011-12-12 00:01:39 +010011073 else {
11074 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11075 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011076 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011077
Victor Stinner488fa492011-12-12 00:01:39 +010011078 /* Concat the two Unicode strings */
11079 res = PyUnicode_New(new_len, maxchar);
11080 if (res == NULL)
11081 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011082 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11083 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011084 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011085 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011086 }
11087 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011088 return;
11089
11090error:
Victor Stinner488fa492011-12-12 00:01:39 +010011091 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011092}
11093
11094void
11095PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11096{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011097 PyUnicode_Append(pleft, right);
11098 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011099}
11100
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011101PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011102 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011103\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011104Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011105string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011106interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011107
11108static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011109unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011111 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011112 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011113 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011114 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011115 int kind1, kind2, kind;
11116 void *buf1, *buf2;
11117 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118
Jesus Ceaac451502011-04-20 17:09:23 +020011119 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11120 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011121 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011123 kind1 = PyUnicode_KIND(self);
11124 kind2 = PyUnicode_KIND(substring);
Christian Heimesd47802e2013-06-29 21:33:36 +020011125 if (kind2 > kind1) {
11126 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011127 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011128 }
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011129 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011130 buf1 = PyUnicode_DATA(self);
11131 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011132 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011133 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011134 if (!buf2) {
11135 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011136 return NULL;
11137 }
11138 len1 = PyUnicode_GET_LENGTH(self);
11139 len2 = PyUnicode_GET_LENGTH(substring);
11140
11141 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011142 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011143 case PyUnicode_1BYTE_KIND:
11144 iresult = ucs1lib_count(
11145 ((Py_UCS1*)buf1) + start, end - start,
11146 buf2, len2, PY_SSIZE_T_MAX
11147 );
11148 break;
11149 case PyUnicode_2BYTE_KIND:
11150 iresult = ucs2lib_count(
11151 ((Py_UCS2*)buf1) + start, end - start,
11152 buf2, len2, PY_SSIZE_T_MAX
11153 );
11154 break;
11155 case PyUnicode_4BYTE_KIND:
11156 iresult = ucs4lib_count(
11157 ((Py_UCS4*)buf1) + start, end - start,
11158 buf2, len2, PY_SSIZE_T_MAX
11159 );
11160 break;
11161 default:
11162 assert(0); iresult = 0;
11163 }
11164
11165 result = PyLong_FromSsize_t(iresult);
11166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011167 if (kind2 != kind)
11168 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169
11170 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011171
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172 return result;
11173}
11174
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011175PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011176 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011177\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011178Encode S using the codec registered for encoding. Default encoding\n\
11179is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011180handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011181a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11182'xmlcharrefreplace' as well as any other name registered with\n\
11183codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184
11185static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011186unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011187{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011188 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011189 char *encoding = NULL;
11190 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011191
Benjamin Peterson308d6372009-09-18 21:42:35 +000011192 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11193 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011195 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011196}
11197
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011198PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011199 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011200\n\
11201Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011202If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203
11204static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011205unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011207 Py_ssize_t i, j, line_pos, src_len, incr;
11208 Py_UCS4 ch;
11209 PyObject *u;
11210 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011211 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011213 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011214 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215
Ezio Melotti745d54d2013-11-16 19:10:57 +020011216 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11217 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011218 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219
Antoine Pitrou22425222011-10-04 19:10:51 +020011220 if (PyUnicode_READY(self) == -1)
11221 return NULL;
11222
Thomas Wouters7e474022000-07-16 12:04:32 +000011223 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011224 src_len = PyUnicode_GET_LENGTH(self);
11225 i = j = line_pos = 0;
11226 kind = PyUnicode_KIND(self);
11227 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011228 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011229 for (; i < src_len; i++) {
11230 ch = PyUnicode_READ(kind, src_data, i);
11231 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011232 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011233 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011234 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011235 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011236 goto overflow;
11237 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011238 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011239 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011240 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011242 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011243 goto overflow;
11244 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011246 if (ch == '\n' || ch == '\r')
11247 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011249 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011250 if (!found)
11251 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011252
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011254 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255 if (!u)
11256 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011257 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258
Antoine Pitroue71d5742011-10-04 15:55:09 +020011259 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260
Antoine Pitroue71d5742011-10-04 15:55:09 +020011261 for (; i < src_len; i++) {
11262 ch = PyUnicode_READ(kind, src_data, i);
11263 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011264 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011265 incr = tabsize - (line_pos % tabsize);
11266 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011267 FILL(kind, dest_data, ' ', j, incr);
11268 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011269 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011270 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011271 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011272 line_pos++;
11273 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011274 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011275 if (ch == '\n' || ch == '\r')
11276 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011278 }
11279 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011280 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011281
Antoine Pitroue71d5742011-10-04 15:55:09 +020011282 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011283 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11284 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285}
11286
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011287PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011288 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289\n\
11290Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011291such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011292arguments start and end are interpreted as in slice notation.\n\
11293\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011294Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295
11296static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011297unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011299 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011300 Py_ssize_t start;
11301 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011302 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303
Jesus Ceaac451502011-04-20 17:09:23 +020011304 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11305 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307
Christian Heimesd47802e2013-06-29 21:33:36 +020011308 if (PyUnicode_READY(self) == -1) {
11309 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011310 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011311 }
11312 if (PyUnicode_READY(substring) == -1) {
11313 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011314 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011315 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011316
Victor Stinner7931d9a2011-11-04 00:22:48 +010011317 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318
11319 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011320
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011321 if (result == -2)
11322 return NULL;
11323
Christian Heimes217cfd12007-12-02 14:31:20 +000011324 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325}
11326
11327static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011328unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011330 void *data;
11331 enum PyUnicode_Kind kind;
11332 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011333
11334 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11335 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011336 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011337 }
11338 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11339 PyErr_SetString(PyExc_IndexError, "string index out of range");
11340 return NULL;
11341 }
11342 kind = PyUnicode_KIND(self);
11343 data = PyUnicode_DATA(self);
11344 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011345 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346}
11347
Guido van Rossumc2504932007-09-18 19:42:40 +000011348/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011349 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011350static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011351unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352{
Guido van Rossumc2504932007-09-18 19:42:40 +000011353 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011354 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011355
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011356#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011357 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011358#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011359 if (_PyUnicode_HASH(self) != -1)
11360 return _PyUnicode_HASH(self);
11361 if (PyUnicode_READY(self) == -1)
11362 return -1;
11363 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011364 /*
11365 We make the hash of the empty string be 0, rather than using
11366 (prefix ^ suffix), since this slightly obfuscates the hash secret
11367 */
11368 if (len == 0) {
11369 _PyUnicode_HASH(self) = 0;
11370 return 0;
11371 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011372 x = _Py_HashBytes(PyUnicode_DATA(self),
11373 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011375 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376}
11377
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011378PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011379 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011381Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382
11383static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011386 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011387 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011388 Py_ssize_t start;
11389 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390
Jesus Ceaac451502011-04-20 17:09:23 +020011391 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11392 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011394
Christian Heimesd47a0452013-06-29 21:21:37 +020011395 if (PyUnicode_READY(self) == -1) {
11396 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011398 }
11399 if (PyUnicode_READY(substring) == -1) {
11400 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011401 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011402 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011403
Victor Stinner7931d9a2011-11-04 00:22:48 +010011404 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405
11406 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011408 if (result == -2)
11409 return NULL;
11410
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411 if (result < 0) {
11412 PyErr_SetString(PyExc_ValueError, "substring not found");
11413 return NULL;
11414 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011415
Christian Heimes217cfd12007-12-02 14:31:20 +000011416 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417}
11418
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011419PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011420 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011422Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011423at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424
11425static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011426unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428 Py_ssize_t i, length;
11429 int kind;
11430 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431 int cased;
11432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 if (PyUnicode_READY(self) == -1)
11434 return NULL;
11435 length = PyUnicode_GET_LENGTH(self);
11436 kind = PyUnicode_KIND(self);
11437 data = PyUnicode_DATA(self);
11438
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011440 if (length == 1)
11441 return PyBool_FromLong(
11442 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011444 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011446 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011447
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449 for (i = 0; i < length; i++) {
11450 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011451
Benjamin Peterson29060642009-01-31 22:14:21 +000011452 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11453 return PyBool_FromLong(0);
11454 else if (!cased && Py_UNICODE_ISLOWER(ch))
11455 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011457 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458}
11459
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011460PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011461 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011463Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011464at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465
11466static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011467unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 Py_ssize_t i, length;
11470 int kind;
11471 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472 int cased;
11473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011474 if (PyUnicode_READY(self) == -1)
11475 return NULL;
11476 length = PyUnicode_GET_LENGTH(self);
11477 kind = PyUnicode_KIND(self);
11478 data = PyUnicode_DATA(self);
11479
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011481 if (length == 1)
11482 return PyBool_FromLong(
11483 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011485 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011486 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011487 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011488
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011490 for (i = 0; i < length; i++) {
11491 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011492
Benjamin Peterson29060642009-01-31 22:14:21 +000011493 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11494 return PyBool_FromLong(0);
11495 else if (!cased && Py_UNICODE_ISUPPER(ch))
11496 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011498 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499}
11500
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011501PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011502 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011504Return True if S is a titlecased string and there is at least one\n\
11505character in S, i.e. upper- and titlecase characters may only\n\
11506follow uncased characters and lowercase characters only cased ones.\n\
11507Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508
11509static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011510unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011512 Py_ssize_t i, length;
11513 int kind;
11514 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515 int cased, previous_is_cased;
11516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 if (PyUnicode_READY(self) == -1)
11518 return NULL;
11519 length = PyUnicode_GET_LENGTH(self);
11520 kind = PyUnicode_KIND(self);
11521 data = PyUnicode_DATA(self);
11522
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 if (length == 1) {
11525 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11526 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11527 (Py_UNICODE_ISUPPER(ch) != 0));
11528 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011530 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011531 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011532 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011533
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534 cased = 0;
11535 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536 for (i = 0; i < length; i++) {
11537 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011538
Benjamin Peterson29060642009-01-31 22:14:21 +000011539 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11540 if (previous_is_cased)
11541 return PyBool_FromLong(0);
11542 previous_is_cased = 1;
11543 cased = 1;
11544 }
11545 else if (Py_UNICODE_ISLOWER(ch)) {
11546 if (!previous_is_cased)
11547 return PyBool_FromLong(0);
11548 previous_is_cased = 1;
11549 cased = 1;
11550 }
11551 else
11552 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011554 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555}
11556
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011557PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011558 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011560Return True if all characters in S are whitespace\n\
11561and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562
11563static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011564unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011566 Py_ssize_t i, length;
11567 int kind;
11568 void *data;
11569
11570 if (PyUnicode_READY(self) == -1)
11571 return NULL;
11572 length = PyUnicode_GET_LENGTH(self);
11573 kind = PyUnicode_KIND(self);
11574 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011577 if (length == 1)
11578 return PyBool_FromLong(
11579 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011581 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011582 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011583 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011584
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011585 for (i = 0; i < length; i++) {
11586 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011587 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011588 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011590 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591}
11592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011593PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011594 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011595\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011596Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011597and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011598
11599static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011600unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011601{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011602 Py_ssize_t i, length;
11603 int kind;
11604 void *data;
11605
11606 if (PyUnicode_READY(self) == -1)
11607 return NULL;
11608 length = PyUnicode_GET_LENGTH(self);
11609 kind = PyUnicode_KIND(self);
11610 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011611
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011612 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613 if (length == 1)
11614 return PyBool_FromLong(
11615 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011616
11617 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011618 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011619 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621 for (i = 0; i < length; i++) {
11622 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011623 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011624 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011625 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011626}
11627
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011628PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011629 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011630\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011631Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011632and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011633
11634static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011635unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011636{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011637 int kind;
11638 void *data;
11639 Py_ssize_t len, i;
11640
11641 if (PyUnicode_READY(self) == -1)
11642 return NULL;
11643
11644 kind = PyUnicode_KIND(self);
11645 data = PyUnicode_DATA(self);
11646 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011647
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011648 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011649 if (len == 1) {
11650 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11651 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11652 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011653
11654 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011655 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011656 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011658 for (i = 0; i < len; i++) {
11659 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011660 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011661 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011662 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011663 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011664}
11665
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011666PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011667 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011669Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011670False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011671
11672static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011673unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011674{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011675 Py_ssize_t i, length;
11676 int kind;
11677 void *data;
11678
11679 if (PyUnicode_READY(self) == -1)
11680 return NULL;
11681 length = PyUnicode_GET_LENGTH(self);
11682 kind = PyUnicode_KIND(self);
11683 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011686 if (length == 1)
11687 return PyBool_FromLong(
11688 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011690 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011691 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011692 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011693
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011694 for (i = 0; i < length; i++) {
11695 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011696 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011698 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011699}
11700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011701PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011702 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011704Return True if all characters in S are digits\n\
11705and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706
11707static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011708unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011710 Py_ssize_t i, length;
11711 int kind;
11712 void *data;
11713
11714 if (PyUnicode_READY(self) == -1)
11715 return NULL;
11716 length = PyUnicode_GET_LENGTH(self);
11717 kind = PyUnicode_KIND(self);
11718 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011721 if (length == 1) {
11722 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11723 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11724 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011726 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011727 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011728 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011730 for (i = 0; i < length; i++) {
11731 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011732 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011734 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011735}
11736
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011737PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011738 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011740Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011741False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742
11743static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011744unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746 Py_ssize_t i, length;
11747 int kind;
11748 void *data;
11749
11750 if (PyUnicode_READY(self) == -1)
11751 return NULL;
11752 length = PyUnicode_GET_LENGTH(self);
11753 kind = PyUnicode_KIND(self);
11754 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757 if (length == 1)
11758 return PyBool_FromLong(
11759 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011761 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011762 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011763 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765 for (i = 0; i < length; i++) {
11766 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011767 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011769 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770}
11771
Martin v. Löwis47383402007-08-15 07:32:56 +000011772int
11773PyUnicode_IsIdentifier(PyObject *self)
11774{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 int kind;
11776 void *data;
11777 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011778 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011780 if (PyUnicode_READY(self) == -1) {
11781 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011782 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011783 }
11784
11785 /* Special case for empty strings */
11786 if (PyUnicode_GET_LENGTH(self) == 0)
11787 return 0;
11788 kind = PyUnicode_KIND(self);
11789 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011790
11791 /* PEP 3131 says that the first character must be in
11792 XID_Start and subsequent characters in XID_Continue,
11793 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011794 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011795 letters, digits, underscore). However, given the current
11796 definition of XID_Start and XID_Continue, it is sufficient
11797 to check just for these, except that _ must be allowed
11798 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011800 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011801 return 0;
11802
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011803 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011805 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011806 return 1;
11807}
11808
11809PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011810 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011811\n\
11812Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011813to the language definition.\n\
11814\n\
11815Use keyword.iskeyword() to test for reserved identifiers\n\
11816such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011817
11818static PyObject*
11819unicode_isidentifier(PyObject *self)
11820{
11821 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11822}
11823
Georg Brandl559e5d72008-06-11 18:37:52 +000011824PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011825 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011826\n\
11827Return True if all characters in S are considered\n\
11828printable in repr() or S is empty, False otherwise.");
11829
11830static PyObject*
11831unicode_isprintable(PyObject *self)
11832{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011833 Py_ssize_t i, length;
11834 int kind;
11835 void *data;
11836
11837 if (PyUnicode_READY(self) == -1)
11838 return NULL;
11839 length = PyUnicode_GET_LENGTH(self);
11840 kind = PyUnicode_KIND(self);
11841 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011842
11843 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011844 if (length == 1)
11845 return PyBool_FromLong(
11846 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011848 for (i = 0; i < length; i++) {
11849 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011850 Py_RETURN_FALSE;
11851 }
11852 }
11853 Py_RETURN_TRUE;
11854}
11855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011856PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011857 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858\n\
11859Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011860iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861
11862static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011863unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011865 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866}
11867
Martin v. Löwis18e16552006-02-15 17:27:45 +000011868static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011869unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 if (PyUnicode_READY(self) == -1)
11872 return -1;
11873 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874}
11875
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011876PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011877 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011879Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011880done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881
11882static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011883unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011885 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 Py_UCS4 fillchar = ' ';
11887
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011888 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889 return NULL;
11890
Benjamin Petersonbac79492012-01-14 13:34:47 -050011891 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011892 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893
Victor Stinnerc4b49542011-12-11 22:44:26 +010011894 if (PyUnicode_GET_LENGTH(self) >= width)
11895 return unicode_result_unchanged(self);
11896
11897 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898}
11899
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011900PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011901 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011903Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904
11905static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011906unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011908 if (PyUnicode_READY(self) == -1)
11909 return NULL;
11910 if (PyUnicode_IS_ASCII(self))
11911 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011912 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913}
11914
/* Strip modes.  They are passed as the `striptype` argument of the
   strip helpers and are also used as indices into stripformat[]. */
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011915#define LEFTSTRIP 0
11916#define RIGHTSTRIP 1
11917#define BOTHSTRIP 2
11918
11919/* PyArg_ParseTuple() format strings, indexed by the strip modes above */
11920static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
11921
/* Method name for strip mode i: the format string with its first three
   characters ("|O:") skipped. */
11922#define STRIPNAME(i) (stripformat[i]+3)
11923
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011924/* externally visible for str.strip(unicode) */
11925PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011926_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011927{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011928 void *data;
11929 int kind;
11930 Py_ssize_t i, j, len;
11931 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011932 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011934 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11935 return NULL;
11936
11937 kind = PyUnicode_KIND(self);
11938 data = PyUnicode_DATA(self);
11939 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011940 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011941 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11942 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011943 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011944
Benjamin Peterson14339b62009-01-31 16:36:08 +000011945 i = 0;
11946 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011947 while (i < len) {
11948 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11949 if (!BLOOM(sepmask, ch))
11950 break;
11951 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11952 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011953 i++;
11954 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011955 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011956
Benjamin Peterson14339b62009-01-31 16:36:08 +000011957 j = len;
11958 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011959 j--;
11960 while (j >= i) {
11961 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11962 if (!BLOOM(sepmask, ch))
11963 break;
11964 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11965 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011966 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011967 }
11968
Benjamin Peterson29060642009-01-31 22:14:21 +000011969 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011970 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011971
Victor Stinner7931d9a2011-11-04 00:22:48 +010011972 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011973}
11974
11975PyObject*
11976PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11977{
11978 unsigned char *data;
11979 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011980 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981
Victor Stinnerde636f32011-10-01 03:55:54 +020011982 if (PyUnicode_READY(self) == -1)
11983 return NULL;
11984
Victor Stinner684d5fd2012-05-03 02:32:34 +020011985 length = PyUnicode_GET_LENGTH(self);
11986 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011987
Victor Stinner684d5fd2012-05-03 02:32:34 +020011988 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011989 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990
Victor Stinnerde636f32011-10-01 03:55:54 +020011991 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011992 PyErr_SetString(PyExc_IndexError, "string index out of range");
11993 return NULL;
11994 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011995 if (start >= length || end < start)
11996 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011997
Victor Stinner684d5fd2012-05-03 02:32:34 +020011998 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011999 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012000 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012001 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012002 }
12003 else {
12004 kind = PyUnicode_KIND(self);
12005 data = PyUnicode_1BYTE_DATA(self);
12006 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012007 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012008 length);
12009 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012010}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011
12012static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012013do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012014{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012015 Py_ssize_t len, i, j;
12016
12017 if (PyUnicode_READY(self) == -1)
12018 return NULL;
12019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012021
Victor Stinnercc7af722013-04-09 22:39:24 +020012022 if (PyUnicode_IS_ASCII(self)) {
12023 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12024
12025 i = 0;
12026 if (striptype != RIGHTSTRIP) {
12027 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012028 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012029 if (!_Py_ascii_whitespace[ch])
12030 break;
12031 i++;
12032 }
12033 }
12034
12035 j = len;
12036 if (striptype != LEFTSTRIP) {
12037 j--;
12038 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012039 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012040 if (!_Py_ascii_whitespace[ch])
12041 break;
12042 j--;
12043 }
12044 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012045 }
12046 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012047 else {
12048 int kind = PyUnicode_KIND(self);
12049 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012050
Victor Stinnercc7af722013-04-09 22:39:24 +020012051 i = 0;
12052 if (striptype != RIGHTSTRIP) {
12053 while (i < len) {
12054 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12055 if (!Py_UNICODE_ISSPACE(ch))
12056 break;
12057 i++;
12058 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012059 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012060
12061 j = len;
12062 if (striptype != LEFTSTRIP) {
12063 j--;
12064 while (j >= i) {
12065 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12066 if (!Py_UNICODE_ISSPACE(ch))
12067 break;
12068 j--;
12069 }
12070 j++;
12071 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012072 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012073
Victor Stinner7931d9a2011-11-04 00:22:48 +010012074 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075}
12076
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012077
12078static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012079do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012080{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012081 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012082
Serhiy Storchakac6792272013-10-19 21:03:34 +030012083 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012084 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012085
Benjamin Peterson14339b62009-01-31 16:36:08 +000012086 if (sep != NULL && sep != Py_None) {
12087 if (PyUnicode_Check(sep))
12088 return _PyUnicode_XStrip(self, striptype, sep);
12089 else {
12090 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012091 "%s arg must be None or str",
12092 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012093 return NULL;
12094 }
12095 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012096
Benjamin Peterson14339b62009-01-31 16:36:08 +000012097 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012098}
12099
12100
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012101PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012102 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012103\n\
12104Return a copy of the string S with leading and trailing\n\
12105whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012106If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012107
12108static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012109unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012110{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012111 if (PyTuple_GET_SIZE(args) == 0)
12112 return do_strip(self, BOTHSTRIP); /* Common case */
12113 else
12114 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012115}
12116
12117
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012118PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012119 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012120\n\
12121Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012122If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012123
12124static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012125unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012126{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012127 if (PyTuple_GET_SIZE(args) == 0)
12128 return do_strip(self, LEFTSTRIP); /* Common case */
12129 else
12130 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012131}
12132
12133
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012134PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012135 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012136\n\
12137Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012138If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012139
12140static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012141unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012142{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012143 if (PyTuple_GET_SIZE(args) == 0)
12144 return do_strip(self, RIGHTSTRIP); /* Common case */
12145 else
12146 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012147}
12148
12149
Guido van Rossumd57fd912000-03-10 22:53:23 +000012150static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012151unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012152{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012153 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012154 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155
Serhiy Storchaka05997252013-01-26 12:14:02 +020012156 if (len < 1)
12157 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012158
Victor Stinnerc4b49542011-12-11 22:44:26 +010012159 /* no repeat, return original string */
12160 if (len == 1)
12161 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012162
Benjamin Petersonbac79492012-01-14 13:34:47 -050012163 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164 return NULL;
12165
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012166 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012167 PyErr_SetString(PyExc_OverflowError,
12168 "repeated string is too long");
12169 return NULL;
12170 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012171 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012172
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012173 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174 if (!u)
12175 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012176 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012178 if (PyUnicode_GET_LENGTH(str) == 1) {
12179 const int kind = PyUnicode_KIND(str);
12180 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012181 if (kind == PyUnicode_1BYTE_KIND) {
12182 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012183 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012184 }
12185 else if (kind == PyUnicode_2BYTE_KIND) {
12186 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012187 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012188 ucs2[n] = fill_char;
12189 } else {
12190 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12191 assert(kind == PyUnicode_4BYTE_KIND);
12192 for (n = 0; n < len; ++n)
12193 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012194 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012195 }
12196 else {
12197 /* number of characters copied this far */
12198 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012199 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012200 char *to = (char *) PyUnicode_DATA(u);
12201 Py_MEMCPY(to, PyUnicode_DATA(str),
12202 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012203 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012204 n = (done <= nchars-done) ? done : nchars-done;
12205 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012206 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012207 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208 }
12209
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012210 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012211 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212}
12213
Alexander Belopolsky40018472011-02-26 01:02:56 +000012214PyObject *
12215PyUnicode_Replace(PyObject *obj,
12216 PyObject *subobj,
12217 PyObject *replobj,
12218 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219{
12220 PyObject *self;
12221 PyObject *str1;
12222 PyObject *str2;
12223 PyObject *result;
12224
12225 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012226 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012227 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012228 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012229 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012230 Py_DECREF(self);
12231 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232 }
12233 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012234 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012235 Py_DECREF(self);
12236 Py_DECREF(str1);
12237 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012239 if (PyUnicode_READY(self) == -1 ||
12240 PyUnicode_READY(str1) == -1 ||
12241 PyUnicode_READY(str2) == -1)
12242 result = NULL;
12243 else
12244 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012245 Py_DECREF(self);
12246 Py_DECREF(str1);
12247 Py_DECREF(str2);
12248 return result;
12249}
12250
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012251PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012252 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012253\n\
12254Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012255old replaced by new. If the optional argument count is\n\
12256given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257
12258static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012259unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012260{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012261 PyObject *str1;
12262 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012263 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264 PyObject *result;
12265
Martin v. Löwis18e16552006-02-15 17:27:45 +000012266 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012268 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012269 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012270 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012271 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012272 return NULL;
12273 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012274 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012275 Py_DECREF(str1);
12276 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012277 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012278 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12279 result = NULL;
12280 else
12281 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282
12283 Py_DECREF(str1);
12284 Py_DECREF(str2);
12285 return result;
12286}
12287
Alexander Belopolsky40018472011-02-26 01:02:56 +000012288static PyObject *
12289unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012291 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012292 Py_ssize_t isize;
12293 Py_ssize_t osize, squote, dquote, i, o;
12294 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012295 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012296 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012298 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012299 return NULL;
12300
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012301 isize = PyUnicode_GET_LENGTH(unicode);
12302 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 /* Compute length of output, quote characters, and
12305 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012306 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012307 max = 127;
12308 squote = dquote = 0;
12309 ikind = PyUnicode_KIND(unicode);
12310 for (i = 0; i < isize; i++) {
12311 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12312 switch (ch) {
12313 case '\'': squote++; osize++; break;
12314 case '"': dquote++; osize++; break;
12315 case '\\': case '\t': case '\r': case '\n':
12316 osize += 2; break;
12317 default:
12318 /* Fast-path ASCII */
12319 if (ch < ' ' || ch == 0x7f)
12320 osize += 4; /* \xHH */
12321 else if (ch < 0x7f)
12322 osize++;
12323 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12324 osize++;
12325 max = ch > max ? ch : max;
12326 }
12327 else if (ch < 0x100)
12328 osize += 4; /* \xHH */
12329 else if (ch < 0x10000)
12330 osize += 6; /* \uHHHH */
12331 else
12332 osize += 10; /* \uHHHHHHHH */
12333 }
12334 }
12335
12336 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012337 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012338 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012339 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 if (dquote)
12341 /* Both squote and dquote present. Use squote,
12342 and escape them */
12343 osize += squote;
12344 else
12345 quote = '"';
12346 }
Victor Stinner55c08782013-04-14 18:45:39 +020012347 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012348
12349 repr = PyUnicode_New(osize, max);
12350 if (repr == NULL)
12351 return NULL;
12352 okind = PyUnicode_KIND(repr);
12353 odata = PyUnicode_DATA(repr);
12354
12355 PyUnicode_WRITE(okind, odata, 0, quote);
12356 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012357 if (unchanged) {
12358 _PyUnicode_FastCopyCharacters(repr, 1,
12359 unicode, 0,
12360 isize);
12361 }
12362 else {
12363 for (i = 0, o = 1; i < isize; i++) {
12364 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012365
Victor Stinner55c08782013-04-14 18:45:39 +020012366 /* Escape quotes and backslashes */
12367 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012368 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012370 continue;
12371 }
12372
12373 /* Map special whitespace to '\t', \n', '\r' */
12374 if (ch == '\t') {
12375 PyUnicode_WRITE(okind, odata, o++, '\\');
12376 PyUnicode_WRITE(okind, odata, o++, 't');
12377 }
12378 else if (ch == '\n') {
12379 PyUnicode_WRITE(okind, odata, o++, '\\');
12380 PyUnicode_WRITE(okind, odata, o++, 'n');
12381 }
12382 else if (ch == '\r') {
12383 PyUnicode_WRITE(okind, odata, o++, '\\');
12384 PyUnicode_WRITE(okind, odata, o++, 'r');
12385 }
12386
12387 /* Map non-printable US ASCII to '\xhh' */
12388 else if (ch < ' ' || ch == 0x7F) {
12389 PyUnicode_WRITE(okind, odata, o++, '\\');
12390 PyUnicode_WRITE(okind, odata, o++, 'x');
12391 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12392 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12393 }
12394
12395 /* Copy ASCII characters as-is */
12396 else if (ch < 0x7F) {
12397 PyUnicode_WRITE(okind, odata, o++, ch);
12398 }
12399
12400 /* Non-ASCII characters */
12401 else {
12402 /* Map Unicode whitespace and control characters
12403 (categories Z* and C* except ASCII space)
12404 */
12405 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12406 PyUnicode_WRITE(okind, odata, o++, '\\');
12407 /* Map 8-bit characters to '\xhh' */
12408 if (ch <= 0xff) {
12409 PyUnicode_WRITE(okind, odata, o++, 'x');
12410 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12411 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12412 }
12413 /* Map 16-bit characters to '\uxxxx' */
12414 else if (ch <= 0xffff) {
12415 PyUnicode_WRITE(okind, odata, o++, 'u');
12416 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12417 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12418 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12419 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12420 }
12421 /* Map 21-bit characters to '\U00xxxxxx' */
12422 else {
12423 PyUnicode_WRITE(okind, odata, o++, 'U');
12424 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12425 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12426 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12427 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12428 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12429 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12430 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12431 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12432 }
12433 }
12434 /* Copy characters as-is */
12435 else {
12436 PyUnicode_WRITE(okind, odata, o++, ch);
12437 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012438 }
12439 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012440 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012441 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012442 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012443 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012444}
12445
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012446PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012447 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012448\n\
12449Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012450such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012451arguments start and end are interpreted as in slice notation.\n\
12452\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012453Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012454
12455static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012457{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012458 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012459 Py_ssize_t start;
12460 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012461 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012462
Jesus Ceaac451502011-04-20 17:09:23 +020012463 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12464 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012465 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012466
Christian Heimesea71a522013-06-29 21:17:34 +020012467 if (PyUnicode_READY(self) == -1) {
12468 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012469 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012470 }
12471 if (PyUnicode_READY(substring) == -1) {
12472 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012474 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012475
Victor Stinner7931d9a2011-11-04 00:22:48 +010012476 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012477
12478 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012479
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012480 if (result == -2)
12481 return NULL;
12482
Christian Heimes217cfd12007-12-02 14:31:20 +000012483 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012484}
12485
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012486PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012487 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012488\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012489Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490
12491static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012493{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012494 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012495 Py_ssize_t start;
12496 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012497 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012498
Jesus Ceaac451502011-04-20 17:09:23 +020012499 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12500 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012501 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012502
Christian Heimesea71a522013-06-29 21:17:34 +020012503 if (PyUnicode_READY(self) == -1) {
12504 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012506 }
12507 if (PyUnicode_READY(substring) == -1) {
12508 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012510 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012511
Victor Stinner7931d9a2011-11-04 00:22:48 +010012512 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513
12514 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012516 if (result == -2)
12517 return NULL;
12518
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519 if (result < 0) {
12520 PyErr_SetString(PyExc_ValueError, "substring not found");
12521 return NULL;
12522 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012523
Christian Heimes217cfd12007-12-02 14:31:20 +000012524 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525}
12526
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012527PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012528 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012530Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012531done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532
12533static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012534unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012536 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012537 Py_UCS4 fillchar = ' ';
12538
Victor Stinnere9a29352011-10-01 02:14:59 +020012539 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012540 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012541
Benjamin Petersonbac79492012-01-14 13:34:47 -050012542 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012543 return NULL;
12544
Victor Stinnerc4b49542011-12-11 22:44:26 +010012545 if (PyUnicode_GET_LENGTH(self) >= width)
12546 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547
Victor Stinnerc4b49542011-12-11 22:44:26 +010012548 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012549}
12550
Alexander Belopolsky40018472011-02-26 01:02:56 +000012551PyObject *
12552PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012553{
12554 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012555
Guido van Rossumd57fd912000-03-10 22:53:23 +000012556 s = PyUnicode_FromObject(s);
12557 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012558 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012559 if (sep != NULL) {
12560 sep = PyUnicode_FromObject(sep);
12561 if (sep == NULL) {
12562 Py_DECREF(s);
12563 return NULL;
12564 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012565 }
12566
Victor Stinner9310abb2011-10-05 00:59:23 +020012567 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012568
12569 Py_DECREF(s);
12570 Py_XDECREF(sep);
12571 return result;
12572}
12573
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012574PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012575 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012576\n\
12577Return a list of the words in S, using sep as the\n\
12578delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012579splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012580whitespace string is a separator and empty strings are\n\
12581removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012582
12583static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012584unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012585{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012586 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012587 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012588 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012589
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012590 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12591 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012592 return NULL;
12593
12594 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012595 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012597 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012598 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012599 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012600}
12601
Thomas Wouters477c8d52006-05-27 19:21:47 +000012602PyObject *
12603PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12604{
12605 PyObject* str_obj;
12606 PyObject* sep_obj;
12607 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012608 int kind1, kind2, kind;
12609 void *buf1 = NULL, *buf2 = NULL;
12610 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012611
12612 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012613 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012614 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012615 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012616 if (!sep_obj) {
12617 Py_DECREF(str_obj);
12618 return NULL;
12619 }
12620 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12621 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012622 Py_DECREF(str_obj);
12623 return NULL;
12624 }
12625
Victor Stinner14f8f022011-10-05 20:58:25 +020012626 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012627 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012628 kind = Py_MAX(kind1, kind2);
12629 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012630 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012631 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 if (!buf1)
12633 goto onError;
12634 buf2 = PyUnicode_DATA(sep_obj);
12635 if (kind2 != kind)
12636 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12637 if (!buf2)
12638 goto onError;
12639 len1 = PyUnicode_GET_LENGTH(str_obj);
12640 len2 = PyUnicode_GET_LENGTH(sep_obj);
12641
Benjamin Petersonead6b532011-12-20 17:23:42 -060012642 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012644 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12645 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12646 else
12647 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 break;
12649 case PyUnicode_2BYTE_KIND:
12650 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12651 break;
12652 case PyUnicode_4BYTE_KIND:
12653 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12654 break;
12655 default:
12656 assert(0);
12657 out = 0;
12658 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012659
12660 Py_DECREF(sep_obj);
12661 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012662 if (kind1 != kind)
12663 PyMem_Free(buf1);
12664 if (kind2 != kind)
12665 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012666
12667 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012668 onError:
12669 Py_DECREF(sep_obj);
12670 Py_DECREF(str_obj);
12671 if (kind1 != kind && buf1)
12672 PyMem_Free(buf1);
12673 if (kind2 != kind && buf2)
12674 PyMem_Free(buf2);
12675 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012676}
12677
12678
12679PyObject *
12680PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12681{
12682 PyObject* str_obj;
12683 PyObject* sep_obj;
12684 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012685 int kind1, kind2, kind;
12686 void *buf1 = NULL, *buf2 = NULL;
12687 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012688
12689 str_obj = PyUnicode_FromObject(str_in);
12690 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012691 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012692 sep_obj = PyUnicode_FromObject(sep_in);
12693 if (!sep_obj) {
12694 Py_DECREF(str_obj);
12695 return NULL;
12696 }
12697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012698 kind1 = PyUnicode_KIND(str_in);
12699 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012700 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012701 buf1 = PyUnicode_DATA(str_in);
12702 if (kind1 != kind)
12703 buf1 = _PyUnicode_AsKind(str_in, kind);
12704 if (!buf1)
12705 goto onError;
12706 buf2 = PyUnicode_DATA(sep_obj);
12707 if (kind2 != kind)
12708 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12709 if (!buf2)
12710 goto onError;
12711 len1 = PyUnicode_GET_LENGTH(str_obj);
12712 len2 = PyUnicode_GET_LENGTH(sep_obj);
12713
Benjamin Petersonead6b532011-12-20 17:23:42 -060012714 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012716 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12717 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12718 else
12719 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012720 break;
12721 case PyUnicode_2BYTE_KIND:
12722 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12723 break;
12724 case PyUnicode_4BYTE_KIND:
12725 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12726 break;
12727 default:
12728 assert(0);
12729 out = 0;
12730 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012731
12732 Py_DECREF(sep_obj);
12733 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012734 if (kind1 != kind)
12735 PyMem_Free(buf1);
12736 if (kind2 != kind)
12737 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012738
12739 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012740 onError:
12741 Py_DECREF(sep_obj);
12742 Py_DECREF(str_obj);
12743 if (kind1 != kind && buf1)
12744 PyMem_Free(buf1);
12745 if (kind2 != kind && buf2)
12746 PyMem_Free(buf2);
12747 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012748}
12749
12750PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012751 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012752\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012753Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012754the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012755found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012756
12757static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012758unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012759{
Victor Stinner9310abb2011-10-05 00:59:23 +020012760 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012761}
12762
12763PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012764 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012765\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012766Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012767the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012768separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012769
12770static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012771unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012772{
Victor Stinner9310abb2011-10-05 00:59:23 +020012773 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012774}
12775
Alexander Belopolsky40018472011-02-26 01:02:56 +000012776PyObject *
12777PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012778{
12779 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012780
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012781 s = PyUnicode_FromObject(s);
12782 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012783 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012784 if (sep != NULL) {
12785 sep = PyUnicode_FromObject(sep);
12786 if (sep == NULL) {
12787 Py_DECREF(s);
12788 return NULL;
12789 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012790 }
12791
Victor Stinner9310abb2011-10-05 00:59:23 +020012792 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012793
12794 Py_DECREF(s);
12795 Py_XDECREF(sep);
12796 return result;
12797}
12798
12799PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012800 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012801\n\
12802Return a list of the words in S, using sep as the\n\
12803delimiter string, starting at the end of the string and\n\
12804working to the front. If maxsplit is given, at most maxsplit\n\
12805splits are done. If sep is not specified, any whitespace string\n\
12806is a separator.");
12807
12808static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012809unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012810{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012811 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012812 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012813 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012814
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012815 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12816 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012817 return NULL;
12818
12819 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012820 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012821 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012822 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012823 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012824 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012825}
12826
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012827PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012828 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012829\n\
12830Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012831Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012832is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012833
12834static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012835unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012836{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012837 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012838 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012839
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012840 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12841 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012842 return NULL;
12843
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012844 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012845}
12846
12847static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012848PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012849{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012850 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012851}
12852
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012853PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012854 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012855\n\
12856Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012857and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012858
12859static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012860unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012861{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012862 if (PyUnicode_READY(self) == -1)
12863 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012864 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012865}
12866
Larry Hastings61272b72014-01-07 12:41:53 -080012867/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012868
Larry Hastings31826802013-10-19 00:09:25 -070012869@staticmethod
12870str.maketrans as unicode_maketrans
12871
12872 x: object
12873
12874 y: unicode=NULL
12875
12876 z: unicode=NULL
12877
12878 /
12879
12880Return a translation table usable for str.translate().
12881
12882If there is only one argument, it must be a dictionary mapping Unicode
12883ordinals (integers) or characters to Unicode ordinals, strings or None.
12884Character keys will be then converted to ordinals.
12885If there are two arguments, they must be strings of equal length, and
12886in the resulting dictionary, each character in x will be mapped to the
12887character at the same position in y. If there is a third argument, it
12888must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012889[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012890
12891PyDoc_STRVAR(unicode_maketrans__doc__,
Larry Hastings2623c8c2014-02-08 22:15:29 -080012892"maketrans(x, y=None, z=None, /)\n"
12893"--\n"
12894"\n"
Larry Hastings31826802013-10-19 00:09:25 -070012895"Return a translation table usable for str.translate().\n"
12896"\n"
Larry Hastings31826802013-10-19 00:09:25 -070012897"If there is only one argument, it must be a dictionary mapping Unicode\n"
12898"ordinals (integers) or characters to Unicode ordinals, strings or None.\n"
12899"Character keys will be then converted to ordinals.\n"
12900"If there are two arguments, they must be strings of equal length, and\n"
12901"in the resulting dictionary, each character in x will be mapped to the\n"
12902"character at the same position in y. If there is a third argument, it\n"
12903"must be a string, whose characters will be mapped to None in the result.");
12904
12905#define UNICODE_MAKETRANS_METHODDEF \
12906 {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__},
12907
12908static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012909unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z);
Larry Hastings31826802013-10-19 00:09:25 -070012910
12911static PyObject *
Larry Hastingsebdcb502013-11-23 14:54:00 -080012912unicode_maketrans(void *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012913{
Larry Hastings31826802013-10-19 00:09:25 -070012914 PyObject *return_value = NULL;
12915 PyObject *x;
12916 PyObject *y = NULL;
12917 PyObject *z = NULL;
12918
12919 if (!PyArg_ParseTuple(args,
12920 "O|UU:maketrans",
12921 &x, &y, &z))
12922 goto exit;
Larry Hastings5c661892014-01-24 06:17:25 -080012923 return_value = unicode_maketrans_impl(x, y, z);
Larry Hastings31826802013-10-19 00:09:25 -070012924
12925exit:
12926 return return_value;
12927}
12928
12929static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012930unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Larry Hastings2623c8c2014-02-08 22:15:29 -080012931/*[clinic end generated code: output=566edf630f77436a input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012932{
Georg Brandlceee0772007-11-27 23:48:05 +000012933 PyObject *new = NULL, *key, *value;
12934 Py_ssize_t i = 0;
12935 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012936
Georg Brandlceee0772007-11-27 23:48:05 +000012937 new = PyDict_New();
12938 if (!new)
12939 return NULL;
12940 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012941 int x_kind, y_kind, z_kind;
12942 void *x_data, *y_data, *z_data;
12943
Georg Brandlceee0772007-11-27 23:48:05 +000012944 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012945 if (!PyUnicode_Check(x)) {
12946 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12947 "be a string if there is a second argument");
12948 goto err;
12949 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012950 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012951 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12952 "arguments must have equal length");
12953 goto err;
12954 }
12955 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012956 x_kind = PyUnicode_KIND(x);
12957 y_kind = PyUnicode_KIND(y);
12958 x_data = PyUnicode_DATA(x);
12959 y_data = PyUnicode_DATA(y);
12960 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12961 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012962 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012963 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012964 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012965 if (!value) {
12966 Py_DECREF(key);
12967 goto err;
12968 }
Georg Brandlceee0772007-11-27 23:48:05 +000012969 res = PyDict_SetItem(new, key, value);
12970 Py_DECREF(key);
12971 Py_DECREF(value);
12972 if (res < 0)
12973 goto err;
12974 }
12975 /* create entries for deleting chars in z */
12976 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012977 z_kind = PyUnicode_KIND(z);
12978 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012979 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012980 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012981 if (!key)
12982 goto err;
12983 res = PyDict_SetItem(new, key, Py_None);
12984 Py_DECREF(key);
12985 if (res < 0)
12986 goto err;
12987 }
12988 }
12989 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012990 int kind;
12991 void *data;
12992
Georg Brandlceee0772007-11-27 23:48:05 +000012993 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012994 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012995 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12996 "to maketrans it must be a dict");
12997 goto err;
12998 }
12999 /* copy entries into the new dict, converting string keys to int keys */
13000 while (PyDict_Next(x, &i, &key, &value)) {
13001 if (PyUnicode_Check(key)) {
13002 /* convert string keys to integer keys */
13003 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013004 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013005 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13006 "table must be of length 1");
13007 goto err;
13008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013009 kind = PyUnicode_KIND(key);
13010 data = PyUnicode_DATA(key);
13011 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013012 if (!newkey)
13013 goto err;
13014 res = PyDict_SetItem(new, newkey, value);
13015 Py_DECREF(newkey);
13016 if (res < 0)
13017 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013018 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013019 /* just keep integer keys */
13020 if (PyDict_SetItem(new, key, value) < 0)
13021 goto err;
13022 } else {
13023 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13024 "be strings or integers");
13025 goto err;
13026 }
13027 }
13028 }
13029 return new;
13030 err:
13031 Py_DECREF(new);
13032 return NULL;
13033}
13034
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013035PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013036 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013037\n\
13038Return a copy of the string S, where all characters have been mapped\n\
13039through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000013040Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000013041Unmapped characters are left untouched. Characters mapped to None\n\
13042are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013043
13044static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013045unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013046{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013047 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013048}
13049
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013050PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013051 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013052\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013053Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013054
13055static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013056unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013057{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013058 if (PyUnicode_READY(self) == -1)
13059 return NULL;
13060 if (PyUnicode_IS_ASCII(self))
13061 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013062 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013063}
13064
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013065PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013066 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013067\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013068Pad a numeric string S with zeros on the left, to fill a field\n\
13069of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013070
13071static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013072unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013073{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013074 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013075 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013076 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013077 int kind;
13078 void *data;
13079 Py_UCS4 chr;
13080
Martin v. Löwis18e16552006-02-15 17:27:45 +000013081 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082 return NULL;
13083
Benjamin Petersonbac79492012-01-14 13:34:47 -050013084 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013085 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086
Victor Stinnerc4b49542011-12-11 22:44:26 +010013087 if (PyUnicode_GET_LENGTH(self) >= width)
13088 return unicode_result_unchanged(self);
13089
13090 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013091
13092 u = pad(self, fill, 0, '0');
13093
Walter Dörwald068325e2002-04-15 13:36:47 +000013094 if (u == NULL)
13095 return NULL;
13096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013097 kind = PyUnicode_KIND(u);
13098 data = PyUnicode_DATA(u);
13099 chr = PyUnicode_READ(kind, data, fill);
13100
13101 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013103 PyUnicode_WRITE(kind, data, 0, chr);
13104 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105 }
13106
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013107 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013108 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110
#if 0
/* Compiled out: thin wrapper exposing
   PyUnicode_TransformDecimalAndSpaceToASCII() as a method. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13118
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013119PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013120 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013121\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013122Return True if S starts with the specified prefix, False otherwise.\n\
13123With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013124With optional end, stop comparing S at that position.\n\
13125prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126
13127static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013128unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013129 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013131 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013132 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013133 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013134 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013135 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136
Jesus Ceaac451502011-04-20 17:09:23 +020013137 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013138 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013139 if (PyTuple_Check(subobj)) {
13140 Py_ssize_t i;
13141 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013142 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013143 if (substring == NULL)
13144 return NULL;
13145 result = tailmatch(self, substring, start, end, -1);
13146 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013147 if (result == -1)
13148 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013149 if (result) {
13150 Py_RETURN_TRUE;
13151 }
13152 }
13153 /* nothing matched */
13154 Py_RETURN_FALSE;
13155 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013156 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013157 if (substring == NULL) {
13158 if (PyErr_ExceptionMatches(PyExc_TypeError))
13159 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13160 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013161 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013162 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013163 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013164 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013165 if (result == -1)
13166 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013167 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013168}
13169
13170
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013171PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013172 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013173\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013174Return True if S ends with the specified suffix, False otherwise.\n\
13175With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013176With optional end, stop comparing S at that position.\n\
13177suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013178
13179static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013180unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013181 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013182{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013183 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013184 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013185 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013186 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013187 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013188
Jesus Ceaac451502011-04-20 17:09:23 +020013189 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013190 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013191 if (PyTuple_Check(subobj)) {
13192 Py_ssize_t i;
13193 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013194 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013195 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013196 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013197 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013198 result = tailmatch(self, substring, start, end, +1);
13199 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013200 if (result == -1)
13201 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013202 if (result) {
13203 Py_RETURN_TRUE;
13204 }
13205 }
13206 Py_RETURN_FALSE;
13207 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013208 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013209 if (substring == NULL) {
13210 if (PyErr_ExceptionMatches(PyExc_TypeError))
13211 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13212 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013213 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013214 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013215 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013216 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013217 if (result == -1)
13218 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013219 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013220}
13221
Victor Stinner202fdca2012-05-07 12:47:02 +020013222Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013223_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013224{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013225 if (!writer->readonly)
13226 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13227 else {
13228 /* Copy-on-write mode: set buffer size to 0 so
13229 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13230 * next write. */
13231 writer->size = 0;
13232 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013233 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13234 writer->data = PyUnicode_DATA(writer->buffer);
13235 writer->kind = PyUnicode_KIND(writer->buffer);
13236}
13237
Victor Stinnerd3f08822012-05-29 12:57:52 +020013238void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013239_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013240{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013241 memset(writer, 0, sizeof(*writer));
13242#ifdef Py_DEBUG
13243 writer->kind = 5; /* invalid kind */
13244#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013245 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013246}
13247
Victor Stinnerd3f08822012-05-29 12:57:52 +020013248int
13249_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13250 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013251{
Victor Stinner6989ba02013-11-18 21:08:39 +010013252#ifdef MS_WINDOWS
13253 /* On Windows, overallocate by 50% is the best factor */
13254# define OVERALLOCATE_FACTOR 2
13255#else
13256 /* On Linux, overallocate by 25% is the best factor */
13257# define OVERALLOCATE_FACTOR 4
13258#endif
Victor Stinner202fdca2012-05-07 12:47:02 +020013259 Py_ssize_t newlen;
13260 PyObject *newbuffer;
13261
Victor Stinnerd3f08822012-05-29 12:57:52 +020013262 assert(length > 0);
13263
Victor Stinner202fdca2012-05-07 12:47:02 +020013264 if (length > PY_SSIZE_T_MAX - writer->pos) {
13265 PyErr_NoMemory();
13266 return -1;
13267 }
13268 newlen = writer->pos + length;
13269
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013270 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013271
Victor Stinnerd3f08822012-05-29 12:57:52 +020013272 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013273 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013274 if (writer->overallocate
13275 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13276 /* overallocate to limit the number of realloc() */
13277 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013278 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013279 if (newlen < writer->min_length)
13280 newlen = writer->min_length;
13281
Victor Stinnerd3f08822012-05-29 12:57:52 +020013282 writer->buffer = PyUnicode_New(newlen, maxchar);
13283 if (writer->buffer == NULL)
13284 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013285 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013286 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013287 if (writer->overallocate
13288 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13289 /* overallocate to limit the number of realloc() */
13290 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013291 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013292 if (newlen < writer->min_length)
13293 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013294
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013295 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013296 /* resize + widen */
13297 newbuffer = PyUnicode_New(newlen, maxchar);
13298 if (newbuffer == NULL)
13299 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013300 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13301 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013302 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013303 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013304 }
13305 else {
13306 newbuffer = resize_compact(writer->buffer, newlen);
13307 if (newbuffer == NULL)
13308 return -1;
13309 }
13310 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013311 }
13312 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013313 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013314 newbuffer = PyUnicode_New(writer->size, maxchar);
13315 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013316 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013317 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13318 writer->buffer, 0, writer->pos);
13319 Py_DECREF(writer->buffer);
13320 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013321 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013322 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013323 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013324
13325#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013326}
13327
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013328Py_LOCAL_INLINE(int)
13329_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013330{
13331 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13332 return -1;
13333 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13334 writer->pos++;
13335 return 0;
13336}
13337
13338int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013339_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13340{
13341 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13342}
13343
13344int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013345_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13346{
13347 Py_UCS4 maxchar;
13348 Py_ssize_t len;
13349
13350 if (PyUnicode_READY(str) == -1)
13351 return -1;
13352 len = PyUnicode_GET_LENGTH(str);
13353 if (len == 0)
13354 return 0;
13355 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13356 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013357 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013358 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013359 Py_INCREF(str);
13360 writer->buffer = str;
13361 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013362 writer->pos += len;
13363 return 0;
13364 }
13365 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13366 return -1;
13367 }
13368 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13369 str, 0, len);
13370 writer->pos += len;
13371 return 0;
13372}
13373
Victor Stinnere215d962012-10-06 23:03:36 +020013374int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013375_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13376 Py_ssize_t start, Py_ssize_t end)
13377{
13378 Py_UCS4 maxchar;
13379 Py_ssize_t len;
13380
13381 if (PyUnicode_READY(str) == -1)
13382 return -1;
13383
13384 assert(0 <= start);
13385 assert(end <= PyUnicode_GET_LENGTH(str));
13386 assert(start <= end);
13387
13388 if (end == 0)
13389 return 0;
13390
13391 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13392 return _PyUnicodeWriter_WriteStr(writer, str);
13393
13394 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13395 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13396 else
13397 maxchar = writer->maxchar;
13398 len = end - start;
13399
13400 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13401 return -1;
13402
13403 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13404 str, start, len);
13405 writer->pos += len;
13406 return 0;
13407}
13408
13409int
Victor Stinner4a587072013-11-19 12:54:53 +010013410_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13411 const char *ascii, Py_ssize_t len)
13412{
13413 if (len == -1)
13414 len = strlen(ascii);
13415
13416 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13417
13418 if (writer->buffer == NULL && !writer->overallocate) {
13419 PyObject *str;
13420
13421 str = _PyUnicode_FromASCII(ascii, len);
13422 if (str == NULL)
13423 return -1;
13424
13425 writer->readonly = 1;
13426 writer->buffer = str;
13427 _PyUnicodeWriter_Update(writer);
13428 writer->pos += len;
13429 return 0;
13430 }
13431
13432 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13433 return -1;
13434
13435 switch (writer->kind)
13436 {
13437 case PyUnicode_1BYTE_KIND:
13438 {
13439 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13440 Py_UCS1 *data = writer->data;
13441
13442 Py_MEMCPY(data + writer->pos, str, len);
13443 break;
13444 }
13445 case PyUnicode_2BYTE_KIND:
13446 {
13447 _PyUnicode_CONVERT_BYTES(
13448 Py_UCS1, Py_UCS2,
13449 ascii, ascii + len,
13450 (Py_UCS2 *)writer->data + writer->pos);
13451 break;
13452 }
13453 case PyUnicode_4BYTE_KIND:
13454 {
13455 _PyUnicode_CONVERT_BYTES(
13456 Py_UCS1, Py_UCS4,
13457 ascii, ascii + len,
13458 (Py_UCS4 *)writer->data + writer->pos);
13459 break;
13460 }
13461 default:
13462 assert(0);
13463 }
13464
13465 writer->pos += len;
13466 return 0;
13467}
13468
13469int
13470_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13471 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013472{
13473 Py_UCS4 maxchar;
13474
13475 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13476 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13477 return -1;
13478 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13479 writer->pos += len;
13480 return 0;
13481}
13482
Victor Stinnerd3f08822012-05-29 12:57:52 +020013483PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013484_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013485{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013486 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013487 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013488 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013489 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013490 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013491 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013492 str = writer->buffer;
13493 writer->buffer = NULL;
13494 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13495 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013496 }
13497 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13498 PyObject *newbuffer;
13499 newbuffer = resize_compact(writer->buffer, writer->pos);
13500 if (newbuffer == NULL) {
Serhiy Storchakadfe98a12014-02-09 13:46:20 +020013501 Py_CLEAR(writer->buffer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013502 return NULL;
13503 }
13504 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013505 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013506 str = writer->buffer;
13507 writer->buffer = NULL;
13508 assert(_PyUnicode_CheckConsistency(str, 1));
13509 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013510}
13511
Victor Stinnerd3f08822012-05-29 12:57:52 +020013512void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013513_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013514{
13515 Py_CLEAR(writer->buffer);
13516}
13517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013518#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013519
13520PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013521 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013522\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013523Return a formatted version of S, using substitutions from args and kwargs.\n\
13524The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013525
Eric Smith27bbca62010-11-04 17:06:58 +000013526PyDoc_STRVAR(format_map__doc__,
13527 "S.format_map(mapping) -> str\n\
13528\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013529Return a formatted version of S, using substitutions from mapping.\n\
13530The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013531
Eric Smith4a7d76d2008-05-30 18:10:19 +000013532static PyObject *
13533unicode__format__(PyObject* self, PyObject* args)
13534{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013535 PyObject *format_spec;
13536 _PyUnicodeWriter writer;
13537 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013538
13539 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13540 return NULL;
13541
Victor Stinnerd3f08822012-05-29 12:57:52 +020013542 if (PyUnicode_READY(self) == -1)
13543 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013544 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013545 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13546 self, format_spec, 0,
13547 PyUnicode_GET_LENGTH(format_spec));
13548 if (ret == -1) {
13549 _PyUnicodeWriter_Dealloc(&writer);
13550 return NULL;
13551 }
13552 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013553}
13554
Eric Smith8c663262007-08-25 02:26:07 +000013555PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013556 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013557\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013558Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013559
13560static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013561unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013562{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013563 Py_ssize_t size;
13564
13565 /* If it's a compact object, account for base structure +
13566 character data. */
13567 if (PyUnicode_IS_COMPACT_ASCII(v))
13568 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13569 else if (PyUnicode_IS_COMPACT(v))
13570 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013571 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013572 else {
13573 /* If it is a two-block object, account for base object, and
13574 for character block if present. */
13575 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013576 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013577 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013578 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013579 }
13580 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013581 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013582 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013583 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013584 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013585 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013586
13587 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013588}
13589
13590PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013591 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013592
13593static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013594unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013595{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013596 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013597 if (!copy)
13598 return NULL;
13599 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013600}
13601
Guido van Rossumd57fd912000-03-10 22:53:23 +000013602static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013603 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013604 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013605 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13606 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013607 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13608 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013609 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013610 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13611 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13612 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013613 {"expandtabs", (PyCFunction) unicode_expandtabs,
13614 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013615 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013616 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013617 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13618 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13619 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013620 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013621 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13622 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13623 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013624 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013625 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013626 {"splitlines", (PyCFunction) unicode_splitlines,
13627 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013628 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013629 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13630 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13631 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13632 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13633 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13634 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13635 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13636 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13637 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13638 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13639 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13640 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13641 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13642 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013643 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013644 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013645 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013646 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013647 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013648 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013649 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013650 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013651#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013652 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013653 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013654#endif
13655
Benjamin Peterson14339b62009-01-31 16:36:08 +000013656 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013657 {NULL, NULL}
13658};
13659
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013660static PyObject *
13661unicode_mod(PyObject *v, PyObject *w)
13662{
Brian Curtindfc80e32011-08-10 20:28:54 -050013663 if (!PyUnicode_Check(v))
13664 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013665 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013666}
13667
13668static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013669 0, /*nb_add*/
13670 0, /*nb_subtract*/
13671 0, /*nb_multiply*/
13672 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013673};
13674
Guido van Rossumd57fd912000-03-10 22:53:23 +000013675static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013676 (lenfunc) unicode_length, /* sq_length */
13677 PyUnicode_Concat, /* sq_concat */
13678 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13679 (ssizeargfunc) unicode_getitem, /* sq_item */
13680 0, /* sq_slice */
13681 0, /* sq_ass_item */
13682 0, /* sq_ass_slice */
13683 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013684};
13685
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013686static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013687unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013688{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013689 if (PyUnicode_READY(self) == -1)
13690 return NULL;
13691
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013692 if (PyIndex_Check(item)) {
13693 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013694 if (i == -1 && PyErr_Occurred())
13695 return NULL;
13696 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013697 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013698 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013699 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013700 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013701 PyObject *result;
13702 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013703 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013704 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013705
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013706 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013707 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013708 return NULL;
13709 }
13710
13711 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013712 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013713 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013714 slicelength == PyUnicode_GET_LENGTH(self)) {
13715 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013716 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013717 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013718 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013719 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013720 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013721 src_kind = PyUnicode_KIND(self);
13722 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013723 if (!PyUnicode_IS_ASCII(self)) {
13724 kind_limit = kind_maxchar_limit(src_kind);
13725 max_char = 0;
13726 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13727 ch = PyUnicode_READ(src_kind, src_data, cur);
13728 if (ch > max_char) {
13729 max_char = ch;
13730 if (max_char >= kind_limit)
13731 break;
13732 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013733 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013734 }
Victor Stinner55c99112011-10-13 01:17:06 +020013735 else
13736 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013737 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013738 if (result == NULL)
13739 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013740 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013741 dest_data = PyUnicode_DATA(result);
13742
13743 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013744 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13745 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013746 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013747 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013748 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013749 } else {
13750 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13751 return NULL;
13752 }
13753}
13754
13755static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013756 (lenfunc)unicode_length, /* mp_length */
13757 (binaryfunc)unicode_subscript, /* mp_subscript */
13758 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013759};
13760
Guido van Rossumd57fd912000-03-10 22:53:23 +000013761
Guido van Rossumd57fd912000-03-10 22:53:23 +000013762/* Helpers for PyUnicode_Format() */
13763
Victor Stinnera47082312012-10-04 02:19:54 +020013764struct unicode_formatter_t {
13765 PyObject *args;
13766 int args_owned;
13767 Py_ssize_t arglen, argidx;
13768 PyObject *dict;
13769
13770 enum PyUnicode_Kind fmtkind;
13771 Py_ssize_t fmtcnt, fmtpos;
13772 void *fmtdata;
13773 PyObject *fmtstr;
13774
13775 _PyUnicodeWriter writer;
13776};
13777
13778struct unicode_format_arg_t {
13779 Py_UCS4 ch;
13780 int flags;
13781 Py_ssize_t width;
13782 int prec;
13783 int sign;
13784};
13785
Guido van Rossumd57fd912000-03-10 22:53:23 +000013786static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013787unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013788{
Victor Stinnera47082312012-10-04 02:19:54 +020013789 Py_ssize_t argidx = ctx->argidx;
13790
13791 if (argidx < ctx->arglen) {
13792 ctx->argidx++;
13793 if (ctx->arglen < 0)
13794 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013795 else
Victor Stinnera47082312012-10-04 02:19:54 +020013796 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013797 }
13798 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013799 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013800 return NULL;
13801}
13802
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013803/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013804
Victor Stinnera47082312012-10-04 02:19:54 +020013805/* Format a float into the writer if the writer is not NULL, or into *p_output
13806 otherwise.
13807
13808 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013809static int
Victor Stinnera47082312012-10-04 02:19:54 +020013810formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13811 PyObject **p_output,
13812 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013813{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013814 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013815 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013816 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013817 int prec;
13818 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013819
Guido van Rossumd57fd912000-03-10 22:53:23 +000013820 x = PyFloat_AsDouble(v);
13821 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013822 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013823
Victor Stinnera47082312012-10-04 02:19:54 +020013824 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013825 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013826 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013827
Victor Stinnera47082312012-10-04 02:19:54 +020013828 if (arg->flags & F_ALT)
13829 dtoa_flags = Py_DTSF_ALT;
13830 else
13831 dtoa_flags = 0;
13832 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013833 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013834 return -1;
13835 len = strlen(p);
13836 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013837 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013838 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013839 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013840 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013841 }
13842 else
13843 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013844 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013845 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013846}
13847
Victor Stinnerd0880d52012-04-27 23:40:13 +020013848/* formatlong() emulates the format codes d, u, o, x and X, and
13849 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13850 * Python's regular ints.
13851 * Return value: a new PyUnicodeObject*, or NULL if error.
13852 * The output string is of the form
13853 * "-"? ("0x" | "0X")? digit+
13854 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13855 * set in flags. The case of hex digits will be correct,
13856 * There will be at least prec digits, zero-filled on the left if
13857 * necessary to get that many.
13858 * val object to be converted
13859 * flags bitmask of format flags; only F_ALT is looked at
13860 * prec minimum number of digits; 0-fill on left if needed
13861 * type a character in [duoxX]; u acts the same as d
13862 *
13863 * CAUTION: o, x and X conversions on regular ints can never
13864 * produce a '-' sign, but can for Python's unbounded ints.
13865 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013866static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013867formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013868{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013869 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013870 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013871 Py_ssize_t i;
13872 int sign; /* 1 if '-', else 0 */
13873 int len; /* number of characters */
13874 Py_ssize_t llen;
13875 int numdigits; /* len == numnondigits + numdigits */
13876 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013877 int prec = arg->prec;
13878 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013879
Victor Stinnerd0880d52012-04-27 23:40:13 +020013880 /* Avoid exceeding SSIZE_T_MAX */
13881 if (prec > INT_MAX-3) {
13882 PyErr_SetString(PyExc_OverflowError,
13883 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013884 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013885 }
13886
13887 assert(PyLong_Check(val));
13888
13889 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013890 default:
13891 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013892 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013893 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013894 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013895 /* int and int subclasses should print numerically when a numeric */
13896 /* format code is used (see issue18780) */
13897 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013898 break;
13899 case 'o':
13900 numnondigits = 2;
13901 result = PyNumber_ToBase(val, 8);
13902 break;
13903 case 'x':
13904 case 'X':
13905 numnondigits = 2;
13906 result = PyNumber_ToBase(val, 16);
13907 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013908 }
13909 if (!result)
13910 return NULL;
13911
13912 assert(unicode_modifiable(result));
13913 assert(PyUnicode_IS_READY(result));
13914 assert(PyUnicode_IS_ASCII(result));
13915
13916 /* To modify the string in-place, there can only be one reference. */
13917 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013918 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013919 PyErr_BadInternalCall();
13920 return NULL;
13921 }
13922 buf = PyUnicode_DATA(result);
13923 llen = PyUnicode_GET_LENGTH(result);
13924 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013925 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013926 PyErr_SetString(PyExc_ValueError,
13927 "string too large in _PyBytes_FormatLong");
13928 return NULL;
13929 }
13930 len = (int)llen;
13931 sign = buf[0] == '-';
13932 numnondigits += sign;
13933 numdigits = len - numnondigits;
13934 assert(numdigits > 0);
13935
13936 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013937 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013938 (type == 'o' || type == 'x' || type == 'X'))) {
13939 assert(buf[sign] == '0');
13940 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13941 buf[sign+1] == 'o');
13942 numnondigits -= 2;
13943 buf += 2;
13944 len -= 2;
13945 if (sign)
13946 buf[0] = '-';
13947 assert(len == numnondigits + numdigits);
13948 assert(numdigits > 0);
13949 }
13950
13951 /* Fill with leading zeroes to meet minimum width. */
13952 if (prec > numdigits) {
13953 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13954 numnondigits + prec);
13955 char *b1;
13956 if (!r1) {
13957 Py_DECREF(result);
13958 return NULL;
13959 }
13960 b1 = PyBytes_AS_STRING(r1);
13961 for (i = 0; i < numnondigits; ++i)
13962 *b1++ = *buf++;
13963 for (i = 0; i < prec - numdigits; i++)
13964 *b1++ = '0';
13965 for (i = 0; i < numdigits; i++)
13966 *b1++ = *buf++;
13967 *b1 = '\0';
13968 Py_DECREF(result);
13969 result = r1;
13970 buf = PyBytes_AS_STRING(result);
13971 len = numnondigits + prec;
13972 }
13973
13974 /* Fix up case for hex conversions. */
13975 if (type == 'X') {
13976 /* Need to convert all lower case letters to upper case.
13977 and need to convert 0x to 0X (and -0x to -0X). */
13978 for (i = 0; i < len; i++)
13979 if (buf[i] >= 'a' && buf[i] <= 'x')
13980 buf[i] -= 'a'-'A';
13981 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013982 if (!PyUnicode_Check(result)
13983 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013984 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013985 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013986 Py_DECREF(result);
13987 result = unicode;
13988 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013989 else if (len != PyUnicode_GET_LENGTH(result)) {
13990 if (PyUnicode_Resize(&result, len) < 0)
13991 Py_CLEAR(result);
13992 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013993 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013994}
13995
Ethan Furmandf3ed242014-01-05 06:50:30 -080013996/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020013997 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013998 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013999 * -1 and raise an exception on error */
14000static int
Victor Stinnera47082312012-10-04 02:19:54 +020014001mainformatlong(PyObject *v,
14002 struct unicode_format_arg_t *arg,
14003 PyObject **p_output,
14004 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014005{
14006 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014007 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014008
14009 if (!PyNumber_Check(v))
14010 goto wrongtype;
14011
Ethan Furmanf9bba9c2014-01-11 23:20:58 -080014012 /* make sure number is a type of integer */
Ethan Furmana70805e2014-01-12 08:42:35 -080014013 /* if not, issue deprecation warning for now */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014014 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014015 if (type == 'o' || type == 'x' || type == 'X') {
14016 iobj = PyNumber_Index(v);
14017 if (iobj == NULL) {
Ethan Furmanf9bba9c2014-01-11 23:20:58 -080014018 PyErr_Clear();
14019 if (PyErr_WarnEx(PyExc_DeprecationWarning,
14020 "automatic int conversions have been deprecated",
14021 1)) {
14022 return -1;
14023 }
14024 iobj = PyNumber_Long(v);
14025 if (iobj == NULL ) {
14026 if (PyErr_ExceptionMatches(PyExc_TypeError))
14027 goto wrongtype;
14028 return -1;
14029 }
Ethan Furmandf3ed242014-01-05 06:50:30 -080014030 }
14031 }
14032 else {
14033 iobj = PyNumber_Long(v);
14034 if (iobj == NULL ) {
14035 if (PyErr_ExceptionMatches(PyExc_TypeError))
14036 goto wrongtype;
14037 return -1;
14038 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014039 }
14040 assert(PyLong_Check(iobj));
14041 }
14042 else {
14043 iobj = v;
14044 Py_INCREF(iobj);
14045 }
14046
14047 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014048 && arg->width == -1 && arg->prec == -1
14049 && !(arg->flags & (F_SIGN | F_BLANK))
14050 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014051 {
14052 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014053 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014054 int base;
14055
Victor Stinnera47082312012-10-04 02:19:54 +020014056 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014057 {
14058 default:
14059 assert(0 && "'type' not in [diuoxX]");
14060 case 'd':
14061 case 'i':
14062 case 'u':
14063 base = 10;
14064 break;
14065 case 'o':
14066 base = 8;
14067 break;
14068 case 'x':
14069 case 'X':
14070 base = 16;
14071 break;
14072 }
14073
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014074 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14075 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014076 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014077 }
14078 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014079 return 1;
14080 }
14081
Victor Stinnera47082312012-10-04 02:19:54 +020014082 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014083 Py_DECREF(iobj);
14084 if (res == NULL)
14085 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014086 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014087 return 0;
14088
14089wrongtype:
14090 PyErr_Format(PyExc_TypeError,
14091 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020014092 "not %.200s",
14093 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014094 return -1;
14095}
14096
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014097static Py_UCS4
14098formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014099{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014100 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014101 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014102 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014103 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014104 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014105 goto onError;
14106 }
14107 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014108 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014109 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014110 /* make sure number is a type of integer */
Ethan Furmana70805e2014-01-12 08:42:35 -080014111 /* if not, issue deprecation warning for now */
Ethan Furmandf3ed242014-01-05 06:50:30 -080014112 if (!PyLong_Check(v)) {
14113 iobj = PyNumber_Index(v);
14114 if (iobj == NULL) {
Ethan Furmanf9bba9c2014-01-11 23:20:58 -080014115 PyErr_Clear();
14116 if (PyErr_WarnEx(PyExc_DeprecationWarning,
14117 "automatic int conversions have been deprecated",
14118 1)) {
14119 return -1;
14120 }
14121 iobj = PyNumber_Long(v);
14122 if (iobj == NULL ) {
14123 if (PyErr_ExceptionMatches(PyExc_TypeError))
14124 goto onError;
14125 return -1;
14126 }
Ethan Furmandf3ed242014-01-05 06:50:30 -080014127 }
14128 v = iobj;
14129 Py_DECREF(iobj);
14130 }
14131 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014132 x = PyLong_AsLong(v);
14133 if (x == -1 && PyErr_Occurred())
14134 goto onError;
14135
Victor Stinner8faf8212011-12-08 22:14:11 +010014136 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014137 PyErr_SetString(PyExc_OverflowError,
14138 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014139 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014140 }
14141
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014142 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014143 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014144
Benjamin Peterson29060642009-01-31 22:14:21 +000014145 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014146 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014147 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014148 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014149}
14150
Victor Stinnera47082312012-10-04 02:19:54 +020014151/* Parse options of an argument: flags, width, precision.
14152 Handle also "%(name)" syntax.
14153
14154 Return 0 if the argument has been formatted into arg->str.
14155 Return 1 if the argument has been written into ctx->writer,
14156 Raise an exception and return -1 on error. */
14157static int
14158unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14159 struct unicode_format_arg_t *arg)
14160{
14161#define FORMAT_READ(ctx) \
14162 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14163
14164 PyObject *v;
14165
Victor Stinnera47082312012-10-04 02:19:54 +020014166 if (arg->ch == '(') {
14167 /* Get argument value from a dictionary. Example: "%(name)s". */
14168 Py_ssize_t keystart;
14169 Py_ssize_t keylen;
14170 PyObject *key;
14171 int pcount = 1;
14172
14173 if (ctx->dict == NULL) {
14174 PyErr_SetString(PyExc_TypeError,
14175 "format requires a mapping");
14176 return -1;
14177 }
14178 ++ctx->fmtpos;
14179 --ctx->fmtcnt;
14180 keystart = ctx->fmtpos;
14181 /* Skip over balanced parentheses */
14182 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14183 arg->ch = FORMAT_READ(ctx);
14184 if (arg->ch == ')')
14185 --pcount;
14186 else if (arg->ch == '(')
14187 ++pcount;
14188 ctx->fmtpos++;
14189 }
14190 keylen = ctx->fmtpos - keystart - 1;
14191 if (ctx->fmtcnt < 0 || pcount > 0) {
14192 PyErr_SetString(PyExc_ValueError,
14193 "incomplete format key");
14194 return -1;
14195 }
14196 key = PyUnicode_Substring(ctx->fmtstr,
14197 keystart, keystart + keylen);
14198 if (key == NULL)
14199 return -1;
14200 if (ctx->args_owned) {
14201 Py_DECREF(ctx->args);
14202 ctx->args_owned = 0;
14203 }
14204 ctx->args = PyObject_GetItem(ctx->dict, key);
14205 Py_DECREF(key);
14206 if (ctx->args == NULL)
14207 return -1;
14208 ctx->args_owned = 1;
14209 ctx->arglen = -1;
14210 ctx->argidx = -2;
14211 }
14212
14213 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014214 while (--ctx->fmtcnt >= 0) {
14215 arg->ch = FORMAT_READ(ctx);
14216 ctx->fmtpos++;
14217 switch (arg->ch) {
14218 case '-': arg->flags |= F_LJUST; continue;
14219 case '+': arg->flags |= F_SIGN; continue;
14220 case ' ': arg->flags |= F_BLANK; continue;
14221 case '#': arg->flags |= F_ALT; continue;
14222 case '0': arg->flags |= F_ZERO; continue;
14223 }
14224 break;
14225 }
14226
14227 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014228 if (arg->ch == '*') {
14229 v = unicode_format_getnextarg(ctx);
14230 if (v == NULL)
14231 return -1;
14232 if (!PyLong_Check(v)) {
14233 PyErr_SetString(PyExc_TypeError,
14234 "* wants int");
14235 return -1;
14236 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014237 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014238 if (arg->width == -1 && PyErr_Occurred())
14239 return -1;
14240 if (arg->width < 0) {
14241 arg->flags |= F_LJUST;
14242 arg->width = -arg->width;
14243 }
14244 if (--ctx->fmtcnt >= 0) {
14245 arg->ch = FORMAT_READ(ctx);
14246 ctx->fmtpos++;
14247 }
14248 }
14249 else if (arg->ch >= '0' && arg->ch <= '9') {
14250 arg->width = arg->ch - '0';
14251 while (--ctx->fmtcnt >= 0) {
14252 arg->ch = FORMAT_READ(ctx);
14253 ctx->fmtpos++;
14254 if (arg->ch < '0' || arg->ch > '9')
14255 break;
14256 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14257 mixing signed and unsigned comparison. Since arg->ch is between
14258 '0' and '9', casting to int is safe. */
14259 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14260 PyErr_SetString(PyExc_ValueError,
14261 "width too big");
14262 return -1;
14263 }
14264 arg->width = arg->width*10 + (arg->ch - '0');
14265 }
14266 }
14267
14268 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014269 if (arg->ch == '.') {
14270 arg->prec = 0;
14271 if (--ctx->fmtcnt >= 0) {
14272 arg->ch = FORMAT_READ(ctx);
14273 ctx->fmtpos++;
14274 }
14275 if (arg->ch == '*') {
14276 v = unicode_format_getnextarg(ctx);
14277 if (v == NULL)
14278 return -1;
14279 if (!PyLong_Check(v)) {
14280 PyErr_SetString(PyExc_TypeError,
14281 "* wants int");
14282 return -1;
14283 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014284 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014285 if (arg->prec == -1 && PyErr_Occurred())
14286 return -1;
14287 if (arg->prec < 0)
14288 arg->prec = 0;
14289 if (--ctx->fmtcnt >= 0) {
14290 arg->ch = FORMAT_READ(ctx);
14291 ctx->fmtpos++;
14292 }
14293 }
14294 else if (arg->ch >= '0' && arg->ch <= '9') {
14295 arg->prec = arg->ch - '0';
14296 while (--ctx->fmtcnt >= 0) {
14297 arg->ch = FORMAT_READ(ctx);
14298 ctx->fmtpos++;
14299 if (arg->ch < '0' || arg->ch > '9')
14300 break;
14301 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14302 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014303 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014304 return -1;
14305 }
14306 arg->prec = arg->prec*10 + (arg->ch - '0');
14307 }
14308 }
14309 }
14310
14311 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14312 if (ctx->fmtcnt >= 0) {
14313 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14314 if (--ctx->fmtcnt >= 0) {
14315 arg->ch = FORMAT_READ(ctx);
14316 ctx->fmtpos++;
14317 }
14318 }
14319 }
14320 if (ctx->fmtcnt < 0) {
14321 PyErr_SetString(PyExc_ValueError,
14322 "incomplete format");
14323 return -1;
14324 }
14325 return 0;
14326
14327#undef FORMAT_READ
14328}
14329
14330/* Format one argument. Supported conversion specifiers:
14331
14332 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014333 - "i", "d", "u": int or float
14334 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014335 - "e", "E", "f", "F", "g", "G": float
14336 - "c": int or str (1 character)
14337
Victor Stinner8dbd4212012-12-04 09:30:24 +010014338 When possible, the output is written directly into the Unicode writer
14339 (ctx->writer). A string is created when padding is required.
14340
Victor Stinnera47082312012-10-04 02:19:54 +020014341 Return 0 if the argument has been formatted into *p_str,
14342 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014343 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014344static int
14345unicode_format_arg_format(struct unicode_formatter_t *ctx,
14346 struct unicode_format_arg_t *arg,
14347 PyObject **p_str)
14348{
14349 PyObject *v;
14350 _PyUnicodeWriter *writer = &ctx->writer;
14351
14352 if (ctx->fmtcnt == 0)
14353 ctx->writer.overallocate = 0;
14354
14355 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014356 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014357 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014358 return 1;
14359 }
14360
14361 v = unicode_format_getnextarg(ctx);
14362 if (v == NULL)
14363 return -1;
14364
Victor Stinnera47082312012-10-04 02:19:54 +020014365
14366 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014367 case 's':
14368 case 'r':
14369 case 'a':
14370 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14371 /* Fast path */
14372 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14373 return -1;
14374 return 1;
14375 }
14376
14377 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14378 *p_str = v;
14379 Py_INCREF(*p_str);
14380 }
14381 else {
14382 if (arg->ch == 's')
14383 *p_str = PyObject_Str(v);
14384 else if (arg->ch == 'r')
14385 *p_str = PyObject_Repr(v);
14386 else
14387 *p_str = PyObject_ASCII(v);
14388 }
14389 break;
14390
14391 case 'i':
14392 case 'd':
14393 case 'u':
14394 case 'o':
14395 case 'x':
14396 case 'X':
14397 {
14398 int ret = mainformatlong(v, arg, p_str, writer);
14399 if (ret != 0)
14400 return ret;
14401 arg->sign = 1;
14402 break;
14403 }
14404
14405 case 'e':
14406 case 'E':
14407 case 'f':
14408 case 'F':
14409 case 'g':
14410 case 'G':
14411 if (arg->width == -1 && arg->prec == -1
14412 && !(arg->flags & (F_SIGN | F_BLANK)))
14413 {
14414 /* Fast path */
14415 if (formatfloat(v, arg, NULL, writer) == -1)
14416 return -1;
14417 return 1;
14418 }
14419
14420 arg->sign = 1;
14421 if (formatfloat(v, arg, p_str, NULL) == -1)
14422 return -1;
14423 break;
14424
14425 case 'c':
14426 {
14427 Py_UCS4 ch = formatchar(v);
14428 if (ch == (Py_UCS4) -1)
14429 return -1;
14430 if (arg->width == -1 && arg->prec == -1) {
14431 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014432 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014433 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014434 return 1;
14435 }
14436 *p_str = PyUnicode_FromOrdinal(ch);
14437 break;
14438 }
14439
14440 default:
14441 PyErr_Format(PyExc_ValueError,
14442 "unsupported format character '%c' (0x%x) "
14443 "at index %zd",
14444 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14445 (int)arg->ch,
14446 ctx->fmtpos - 1);
14447 return -1;
14448 }
14449 if (*p_str == NULL)
14450 return -1;
14451 assert (PyUnicode_Check(*p_str));
14452 return 0;
14453}
14454
14455static int
14456unicode_format_arg_output(struct unicode_formatter_t *ctx,
14457 struct unicode_format_arg_t *arg,
14458 PyObject *str)
14459{
14460 Py_ssize_t len;
14461 enum PyUnicode_Kind kind;
14462 void *pbuf;
14463 Py_ssize_t pindex;
14464 Py_UCS4 signchar;
14465 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014466 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014467 Py_ssize_t sublen;
14468 _PyUnicodeWriter *writer = &ctx->writer;
14469 Py_UCS4 fill;
14470
14471 fill = ' ';
14472 if (arg->sign && arg->flags & F_ZERO)
14473 fill = '0';
14474
14475 if (PyUnicode_READY(str) == -1)
14476 return -1;
14477
14478 len = PyUnicode_GET_LENGTH(str);
14479 if ((arg->width == -1 || arg->width <= len)
14480 && (arg->prec == -1 || arg->prec >= len)
14481 && !(arg->flags & (F_SIGN | F_BLANK)))
14482 {
14483 /* Fast path */
14484 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14485 return -1;
14486 return 0;
14487 }
14488
14489 /* Truncate the string for "s", "r" and "a" formats
14490 if the precision is set */
14491 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14492 if (arg->prec >= 0 && len > arg->prec)
14493 len = arg->prec;
14494 }
14495
14496 /* Adjust sign and width */
14497 kind = PyUnicode_KIND(str);
14498 pbuf = PyUnicode_DATA(str);
14499 pindex = 0;
14500 signchar = '\0';
14501 if (arg->sign) {
14502 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14503 if (ch == '-' || ch == '+') {
14504 signchar = ch;
14505 len--;
14506 pindex++;
14507 }
14508 else if (arg->flags & F_SIGN)
14509 signchar = '+';
14510 else if (arg->flags & F_BLANK)
14511 signchar = ' ';
14512 else
14513 arg->sign = 0;
14514 }
14515 if (arg->width < len)
14516 arg->width = len;
14517
14518 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014519 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014520 if (!(arg->flags & F_LJUST)) {
14521 if (arg->sign) {
14522 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014523 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014524 }
14525 else {
14526 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014527 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014528 }
14529 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014530 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14531 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014532 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014533 }
14534
Victor Stinnera47082312012-10-04 02:19:54 +020014535 buflen = arg->width;
14536 if (arg->sign && len == arg->width)
14537 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014538 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014539 return -1;
14540
14541 /* Write the sign if needed */
14542 if (arg->sign) {
14543 if (fill != ' ') {
14544 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14545 writer->pos += 1;
14546 }
14547 if (arg->width > len)
14548 arg->width--;
14549 }
14550
14551 /* Write the numeric prefix for "x", "X" and "o" formats
14552 if the alternate form is used.
14553 For example, write "0x" for the "%#x" format. */
14554 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14555 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14556 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14557 if (fill != ' ') {
14558 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14559 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14560 writer->pos += 2;
14561 pindex += 2;
14562 }
14563 arg->width -= 2;
14564 if (arg->width < 0)
14565 arg->width = 0;
14566 len -= 2;
14567 }
14568
14569 /* Pad left with the fill character if needed */
14570 if (arg->width > len && !(arg->flags & F_LJUST)) {
14571 sublen = arg->width - len;
14572 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14573 writer->pos += sublen;
14574 arg->width = len;
14575 }
14576
14577 /* If padding with spaces: write sign if needed and/or numeric prefix if
14578 the alternate form is used */
14579 if (fill == ' ') {
14580 if (arg->sign) {
14581 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14582 writer->pos += 1;
14583 }
14584 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14585 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14586 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14587 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14588 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14589 writer->pos += 2;
14590 pindex += 2;
14591 }
14592 }
14593
14594 /* Write characters */
14595 if (len) {
14596 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14597 str, pindex, len);
14598 writer->pos += len;
14599 }
14600
14601 /* Pad right with the fill character if needed */
14602 if (arg->width > len) {
14603 sublen = arg->width - len;
14604 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14605 writer->pos += sublen;
14606 }
14607 return 0;
14608}
14609
14610/* Helper of PyUnicode_Format(): format one arg.
14611 Return 0 on success, raise an exception and return -1 on error. */
14612static int
14613unicode_format_arg(struct unicode_formatter_t *ctx)
14614{
14615 struct unicode_format_arg_t arg;
14616 PyObject *str;
14617 int ret;
14618
Victor Stinner8dbd4212012-12-04 09:30:24 +010014619 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14620 arg.flags = 0;
14621 arg.width = -1;
14622 arg.prec = -1;
14623 arg.sign = 0;
14624 str = NULL;
14625
Victor Stinnera47082312012-10-04 02:19:54 +020014626 ret = unicode_format_arg_parse(ctx, &arg);
14627 if (ret == -1)
14628 return -1;
14629
14630 ret = unicode_format_arg_format(ctx, &arg, &str);
14631 if (ret == -1)
14632 return -1;
14633
14634 if (ret != 1) {
14635 ret = unicode_format_arg_output(ctx, &arg, str);
14636 Py_DECREF(str);
14637 if (ret == -1)
14638 return -1;
14639 }
14640
14641 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14642 PyErr_SetString(PyExc_TypeError,
14643 "not all arguments converted during string formatting");
14644 return -1;
14645 }
14646 return 0;
14647}
14648
Alexander Belopolsky40018472011-02-26 01:02:56 +000014649PyObject *
14650PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014651{
Victor Stinnera47082312012-10-04 02:19:54 +020014652 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014653
Guido van Rossumd57fd912000-03-10 22:53:23 +000014654 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014655 PyErr_BadInternalCall();
14656 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014657 }
Victor Stinnera47082312012-10-04 02:19:54 +020014658
14659 ctx.fmtstr = PyUnicode_FromObject(format);
14660 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014661 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014662 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14663 Py_DECREF(ctx.fmtstr);
14664 return NULL;
14665 }
14666 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14667 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14668 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14669 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014670
Victor Stinner8f674cc2013-04-17 23:02:17 +020014671 _PyUnicodeWriter_Init(&ctx.writer);
14672 ctx.writer.min_length = ctx.fmtcnt + 100;
14673 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014674
Guido van Rossumd57fd912000-03-10 22:53:23 +000014675 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014676 ctx.arglen = PyTuple_Size(args);
14677 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014678 }
14679 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014680 ctx.arglen = -1;
14681 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014682 }
Victor Stinnera47082312012-10-04 02:19:54 +020014683 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014684 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014685 ctx.dict = args;
14686 else
14687 ctx.dict = NULL;
14688 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014689
Victor Stinnera47082312012-10-04 02:19:54 +020014690 while (--ctx.fmtcnt >= 0) {
14691 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014692 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014693
14694 nonfmtpos = ctx.fmtpos++;
14695 while (ctx.fmtcnt >= 0 &&
14696 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14697 ctx.fmtpos++;
14698 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014699 }
Victor Stinnera47082312012-10-04 02:19:54 +020014700 if (ctx.fmtcnt < 0) {
14701 ctx.fmtpos--;
14702 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014703 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014704
Victor Stinnercfc4c132013-04-03 01:48:39 +020014705 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14706 nonfmtpos, ctx.fmtpos) < 0)
14707 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014708 }
14709 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014710 ctx.fmtpos++;
14711 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014712 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014713 }
14714 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014715
Victor Stinnera47082312012-10-04 02:19:54 +020014716 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014717 PyErr_SetString(PyExc_TypeError,
14718 "not all arguments converted during string formatting");
14719 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014720 }
14721
Victor Stinnera47082312012-10-04 02:19:54 +020014722 if (ctx.args_owned) {
14723 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014724 }
Victor Stinnera47082312012-10-04 02:19:54 +020014725 Py_DECREF(ctx.fmtstr);
14726 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014727
Benjamin Peterson29060642009-01-31 22:14:21 +000014728 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014729 Py_DECREF(ctx.fmtstr);
14730 _PyUnicodeWriter_Dealloc(&ctx.writer);
14731 if (ctx.args_owned) {
14732 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014733 }
14734 return NULL;
14735}
14736
Jeremy Hylton938ace62002-07-17 16:30:39 +000014737static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014738unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14739
Tim Peters6d6c1a32001-08-02 04:15:00 +000014740static PyObject *
14741unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14742{
Benjamin Peterson29060642009-01-31 22:14:21 +000014743 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014744 static char *kwlist[] = {"object", "encoding", "errors", 0};
14745 char *encoding = NULL;
14746 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014747
Benjamin Peterson14339b62009-01-31 16:36:08 +000014748 if (type != &PyUnicode_Type)
14749 return unicode_subtype_new(type, args, kwds);
14750 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014751 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014752 return NULL;
14753 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014754 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014755 if (encoding == NULL && errors == NULL)
14756 return PyObject_Str(x);
14757 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014758 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014759}
14760
Guido van Rossume023fe02001-08-30 03:12:59 +000014761static PyObject *
14762unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14763{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014764 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014765 Py_ssize_t length, char_size;
14766 int share_wstr, share_utf8;
14767 unsigned int kind;
14768 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014769
Benjamin Peterson14339b62009-01-31 16:36:08 +000014770 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014771
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014772 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014773 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014774 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014775 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014776 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014777 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014778 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014779 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014780
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014781 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014782 if (self == NULL) {
14783 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014784 return NULL;
14785 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014786 kind = PyUnicode_KIND(unicode);
14787 length = PyUnicode_GET_LENGTH(unicode);
14788
14789 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014790#ifdef Py_DEBUG
14791 _PyUnicode_HASH(self) = -1;
14792#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014793 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014794#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014795 _PyUnicode_STATE(self).interned = 0;
14796 _PyUnicode_STATE(self).kind = kind;
14797 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014798 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014799 _PyUnicode_STATE(self).ready = 1;
14800 _PyUnicode_WSTR(self) = NULL;
14801 _PyUnicode_UTF8_LENGTH(self) = 0;
14802 _PyUnicode_UTF8(self) = NULL;
14803 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014804 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014805
14806 share_utf8 = 0;
14807 share_wstr = 0;
14808 if (kind == PyUnicode_1BYTE_KIND) {
14809 char_size = 1;
14810 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14811 share_utf8 = 1;
14812 }
14813 else if (kind == PyUnicode_2BYTE_KIND) {
14814 char_size = 2;
14815 if (sizeof(wchar_t) == 2)
14816 share_wstr = 1;
14817 }
14818 else {
14819 assert(kind == PyUnicode_4BYTE_KIND);
14820 char_size = 4;
14821 if (sizeof(wchar_t) == 4)
14822 share_wstr = 1;
14823 }
14824
14825 /* Ensure we won't overflow the length. */
14826 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14827 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014828 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014829 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014830 data = PyObject_MALLOC((length + 1) * char_size);
14831 if (data == NULL) {
14832 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014833 goto onError;
14834 }
14835
Victor Stinnerc3c74152011-10-02 20:39:55 +020014836 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014837 if (share_utf8) {
14838 _PyUnicode_UTF8_LENGTH(self) = length;
14839 _PyUnicode_UTF8(self) = data;
14840 }
14841 if (share_wstr) {
14842 _PyUnicode_WSTR_LENGTH(self) = length;
14843 _PyUnicode_WSTR(self) = (wchar_t *)data;
14844 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014845
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014846 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014847 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014848 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014849#ifdef Py_DEBUG
14850 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14851#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014852 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014853 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014854
14855onError:
14856 Py_DECREF(unicode);
14857 Py_DECREF(self);
14858 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014859}
14860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014861PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014862"str(object='') -> str\n\
14863str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014864\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014865Create a new string object from the given object. If encoding or\n\
14866errors is specified, then the object must expose a data buffer\n\
14867that will be decoded using the given encoding and error handler.\n\
14868Otherwise, returns the result of object.__str__() (if defined)\n\
14869or repr(object).\n\
14870encoding defaults to sys.getdefaultencoding().\n\
14871errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014872
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014873static PyObject *unicode_iter(PyObject *seq);
14874
Guido van Rossumd57fd912000-03-10 22:53:23 +000014875PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014876 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014877 "str", /* tp_name */
14878 sizeof(PyUnicodeObject), /* tp_size */
14879 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014880 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014881 (destructor)unicode_dealloc, /* tp_dealloc */
14882 0, /* tp_print */
14883 0, /* tp_getattr */
14884 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014885 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014886 unicode_repr, /* tp_repr */
14887 &unicode_as_number, /* tp_as_number */
14888 &unicode_as_sequence, /* tp_as_sequence */
14889 &unicode_as_mapping, /* tp_as_mapping */
14890 (hashfunc) unicode_hash, /* tp_hash*/
14891 0, /* tp_call*/
14892 (reprfunc) unicode_str, /* tp_str */
14893 PyObject_GenericGetAttr, /* tp_getattro */
14894 0, /* tp_setattro */
14895 0, /* tp_as_buffer */
14896 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014897 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014898 unicode_doc, /* tp_doc */
14899 0, /* tp_traverse */
14900 0, /* tp_clear */
14901 PyUnicode_RichCompare, /* tp_richcompare */
14902 0, /* tp_weaklistoffset */
14903 unicode_iter, /* tp_iter */
14904 0, /* tp_iternext */
14905 unicode_methods, /* tp_methods */
14906 0, /* tp_members */
14907 0, /* tp_getset */
14908 &PyBaseObject_Type, /* tp_base */
14909 0, /* tp_dict */
14910 0, /* tp_descr_get */
14911 0, /* tp_descr_set */
14912 0, /* tp_dictoffset */
14913 0, /* tp_init */
14914 0, /* tp_alloc */
14915 unicode_new, /* tp_new */
14916 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014917};
14918
14919/* Initialize the Unicode implementation */
14920
Victor Stinner3a50e702011-10-18 21:21:00 +020014921int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014922{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014923 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014924 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014925 0x000A, /* LINE FEED */
14926 0x000D, /* CARRIAGE RETURN */
14927 0x001C, /* FILE SEPARATOR */
14928 0x001D, /* GROUP SEPARATOR */
14929 0x001E, /* RECORD SEPARATOR */
14930 0x0085, /* NEXT LINE */
14931 0x2028, /* LINE SEPARATOR */
14932 0x2029, /* PARAGRAPH SEPARATOR */
14933 };
14934
Fred Drakee4315f52000-05-09 19:53:39 +000014935 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014936 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014937 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014938 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014939 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014940
Guido van Rossumcacfc072002-05-24 19:01:59 +000014941 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014942 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014943
14944 /* initialize the linebreak bloom filter */
14945 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014946 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014947 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014948
Christian Heimes26532f72013-07-20 14:57:16 +020014949 if (PyType_Ready(&EncodingMapType) < 0)
14950 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014951
Benjamin Petersonc4311282012-10-30 23:21:10 -040014952 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14953 Py_FatalError("Can't initialize field name iterator type");
14954
14955 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14956 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014957
Victor Stinner3a50e702011-10-18 21:21:00 +020014958#ifdef HAVE_MBCS
14959 winver.dwOSVersionInfoSize = sizeof(winver);
14960 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14961 PyErr_SetFromWindowsErr(0);
14962 return -1;
14963 }
14964#endif
14965 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014966}
14967
14968/* Finalize the Unicode implementation */
14969
/* This implementation does no work here; the result is always 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
14975
Guido van Rossumd57fd912000-03-10 22:53:23 +000014976void
Thomas Wouters78890102000-07-22 19:25:51 +000014977_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014978{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014979 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014980
Serhiy Storchaka05997252013-01-26 12:14:02 +020014981 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014982
Serhiy Storchaka05997252013-01-26 12:14:02 +020014983 for (i = 0; i < 256; i++)
14984 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014985 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014986 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014987}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014988
Walter Dörwald16807132007-05-25 13:52:07 +000014989void
14990PyUnicode_InternInPlace(PyObject **p)
14991{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020014992 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014993 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014994#ifdef Py_DEBUG
14995 assert(s != NULL);
14996 assert(_PyUnicode_CHECK(s));
14997#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014998 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014999 return;
15000#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015001 /* If it's a subclass, we don't really know what putting
15002 it in the interned dict might do. */
15003 if (!PyUnicode_CheckExact(s))
15004 return;
15005 if (PyUnicode_CHECK_INTERNED(s))
15006 return;
15007 if (interned == NULL) {
15008 interned = PyDict_New();
15009 if (interned == NULL) {
15010 PyErr_Clear(); /* Don't leave an exception */
15011 return;
15012 }
15013 }
15014 /* It might be that the GetItem call fails even
15015 though the key is present in the dictionary,
15016 namely when this happens during a stack overflow. */
15017 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010015018 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015019 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000015020
Victor Stinnerf0335102013-04-14 19:13:03 +020015021 if (t) {
15022 Py_INCREF(t);
15023 Py_DECREF(*p);
15024 *p = t;
15025 return;
15026 }
Walter Dörwald16807132007-05-25 13:52:07 +000015027
Benjamin Peterson14339b62009-01-31 16:36:08 +000015028 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015029 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015030 PyErr_Clear();
15031 PyThreadState_GET()->recursion_critical = 0;
15032 return;
15033 }
15034 PyThreadState_GET()->recursion_critical = 0;
15035 /* The two references in interned are not counted by refcnt.
15036 The deallocator will take care of this */
15037 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015038 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015039}
15040
15041void
15042PyUnicode_InternImmortal(PyObject **p)
15043{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015044 PyUnicode_InternInPlace(p);
15045 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015046 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015047 Py_INCREF(*p);
15048 }
Walter Dörwald16807132007-05-25 13:52:07 +000015049}
15050
15051PyObject *
15052PyUnicode_InternFromString(const char *cp)
15053{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015054 PyObject *s = PyUnicode_FromString(cp);
15055 if (s == NULL)
15056 return NULL;
15057 PyUnicode_InternInPlace(&s);
15058 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015059}
15060
Alexander Belopolsky40018472011-02-26 01:02:56 +000015061void
15062_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015063{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015064 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015065 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015066 Py_ssize_t i, n;
15067 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015068
Benjamin Peterson14339b62009-01-31 16:36:08 +000015069 if (interned == NULL || !PyDict_Check(interned))
15070 return;
15071 keys = PyDict_Keys(interned);
15072 if (keys == NULL || !PyList_Check(keys)) {
15073 PyErr_Clear();
15074 return;
15075 }
Walter Dörwald16807132007-05-25 13:52:07 +000015076
Benjamin Peterson14339b62009-01-31 16:36:08 +000015077 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15078 detector, interned unicode strings are not forcibly deallocated;
15079 rather, we give them their stolen references back, and then clear
15080 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015081
Benjamin Peterson14339b62009-01-31 16:36:08 +000015082 n = PyList_GET_SIZE(keys);
15083 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015084 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015085 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015086 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015087 if (PyUnicode_READY(s) == -1) {
15088 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015089 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015090 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015091 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015092 case SSTATE_NOT_INTERNED:
15093 /* XXX Shouldn't happen */
15094 break;
15095 case SSTATE_INTERNED_IMMORTAL:
15096 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015097 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015098 break;
15099 case SSTATE_INTERNED_MORTAL:
15100 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015101 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015102 break;
15103 default:
15104 Py_FatalError("Inconsistent interned string state.");
15105 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015106 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015107 }
15108 fprintf(stderr, "total size of all interned strings: "
15109 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15110 "mortal/immortal\n", mortal_size, immortal_size);
15111 Py_DECREF(keys);
15112 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015113 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015114}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015115
15116
15117/********************* Unicode Iterator **************************/
15118
15119typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015120 PyObject_HEAD
15121 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015122 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015123} unicodeiterobject;
15124
15125static void
15126unicodeiter_dealloc(unicodeiterobject *it)
15127{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015128 _PyObject_GC_UNTRACK(it);
15129 Py_XDECREF(it->it_seq);
15130 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015131}
15132
15133static int
15134unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15135{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015136 Py_VISIT(it->it_seq);
15137 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015138}
15139
15140static PyObject *
15141unicodeiter_next(unicodeiterobject *it)
15142{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015143 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015144
Benjamin Peterson14339b62009-01-31 16:36:08 +000015145 assert(it != NULL);
15146 seq = it->it_seq;
15147 if (seq == NULL)
15148 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015149 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015151 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15152 int kind = PyUnicode_KIND(seq);
15153 void *data = PyUnicode_DATA(seq);
15154 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15155 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015156 if (item != NULL)
15157 ++it->it_index;
15158 return item;
15159 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015160
Benjamin Peterson14339b62009-01-31 16:36:08 +000015161 Py_DECREF(seq);
15162 it->it_seq = NULL;
15163 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015164}
15165
15166static PyObject *
15167unicodeiter_len(unicodeiterobject *it)
15168{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015169 Py_ssize_t len = 0;
15170 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015171 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015172 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015173}
15174
15175PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15176
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015177static PyObject *
15178unicodeiter_reduce(unicodeiterobject *it)
15179{
15180 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015181 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015182 it->it_seq, it->it_index);
15183 } else {
15184 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15185 if (u == NULL)
15186 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015187 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015188 }
15189}
15190
15191PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15192
15193static PyObject *
15194unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15195{
15196 Py_ssize_t index = PyLong_AsSsize_t(state);
15197 if (index == -1 && PyErr_Occurred())
15198 return NULL;
15199 if (index < 0)
15200 index = 0;
15201 it->it_index = index;
15202 Py_RETURN_NONE;
15203}
15204
15205PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15206
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015207static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015208 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015209 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015210 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15211 reduce_doc},
15212 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15213 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015214 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015215};
15216
/* Type object for "str_iterator", the iterator created by unicode_iter().
   Instances are tracked by the cyclic GC (Py_TPFLAGS_HAVE_GC with a
   tp_traverse), act as their own iterator (PyObject_SelfIter) and produce
   items through unicodeiter_next.  Slots are filled positionally; every
   slot left at 0 is unused. */
15217PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015218 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15219 "str_iterator", /* tp_name */
15220 sizeof(unicodeiterobject), /* tp_basicsize */
15221 0, /* tp_itemsize */
15222 /* methods */
15223 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15224 0, /* tp_print */
15225 0, /* tp_getattr */
15226 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015227 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015228 0, /* tp_repr */
15229 0, /* tp_as_number */
15230 0, /* tp_as_sequence */
15231 0, /* tp_as_mapping */
15232 0, /* tp_hash */
15233 0, /* tp_call */
15234 0, /* tp_str */
15235 PyObject_GenericGetAttr, /* tp_getattro */
15236 0, /* tp_setattro */
15237 0, /* tp_as_buffer */
15238 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15239 0, /* tp_doc */
15240 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15241 0, /* tp_clear */
15242 0, /* tp_richcompare */
15243 0, /* tp_weaklistoffset */
15244 PyObject_SelfIter, /* tp_iter */
15245 (iternextfunc)unicodeiter_next, /* tp_iternext */
15246 unicodeiter_methods, /* tp_methods */
15247 0, /* tp_members */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015248};
15249
15250static PyObject *
15251unicode_iter(PyObject *seq)
15252{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015253 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015254
Benjamin Peterson14339b62009-01-31 16:36:08 +000015255 if (!PyUnicode_Check(seq)) {
15256 PyErr_BadInternalCall();
15257 return NULL;
15258 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015259 if (PyUnicode_READY(seq) == -1)
15260 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015261 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15262 if (it == NULL)
15263 return NULL;
15264 it->it_index = 0;
15265 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015266 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015267 _PyObject_GC_TRACK(it);
15268 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015269}
15270
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015271
15272size_t
15273Py_UNICODE_strlen(const Py_UNICODE *u)
15274{
15275 int res = 0;
15276 while(*u++)
15277 res++;
15278 return res;
15279}
15280
15281Py_UNICODE*
15282Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15283{
15284 Py_UNICODE *u = s1;
15285 while ((*u++ = *s2++));
15286 return s1;
15287}
15288
15289Py_UNICODE*
15290Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15291{
15292 Py_UNICODE *u = s1;
15293 while ((*u++ = *s2++))
15294 if (n-- == 0)
15295 break;
15296 return s1;
15297}
15298
15299Py_UNICODE*
15300Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15301{
15302 Py_UNICODE *u1 = s1;
15303 u1 += Py_UNICODE_strlen(u1);
15304 Py_UNICODE_strcpy(u1, s2);
15305 return s1;
15306}
15307
15308int
15309Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15310{
15311 while (*s1 && *s2 && *s1 == *s2)
15312 s1++, s2++;
15313 if (*s1 && *s2)
15314 return (*s1 < *s2) ? -1 : +1;
15315 if (*s1)
15316 return 1;
15317 if (*s2)
15318 return -1;
15319 return 0;
15320}
15321
15322int
15323Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15324{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015325 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015326 for (; n != 0; n--) {
15327 u1 = *s1;
15328 u2 = *s2;
15329 if (u1 != u2)
15330 return (u1 < u2) ? -1 : +1;
15331 if (u1 == '\0')
15332 return 0;
15333 s1++;
15334 s2++;
15335 }
15336 return 0;
15337}
15338
15339Py_UNICODE*
15340Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15341{
15342 const Py_UNICODE *p;
15343 for (p = s; *p; p++)
15344 if (*p == c)
15345 return (Py_UNICODE*)p;
15346 return NULL;
15347}
15348
15349Py_UNICODE*
15350Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15351{
15352 const Py_UNICODE *p;
15353 p = s + Py_UNICODE_strlen(s);
15354 while (p != s) {
15355 p--;
15356 if (*p == c)
15357 return (Py_UNICODE*)p;
15358 }
15359 return NULL;
15360}
Victor Stinner331ea922010-08-10 16:37:20 +000015361
Victor Stinner71133ff2010-09-01 23:43:53 +000015362Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015363PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015364{
Victor Stinner577db2c2011-10-11 22:12:48 +020015365 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015366 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015367
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015368 if (!PyUnicode_Check(unicode)) {
15369 PyErr_BadArgument();
15370 return NULL;
15371 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015372 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015373 if (u == NULL)
15374 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015375 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015376 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015377 PyErr_NoMemory();
15378 return NULL;
15379 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015380 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015381 size *= sizeof(Py_UNICODE);
15382 copy = PyMem_Malloc(size);
15383 if (copy == NULL) {
15384 PyErr_NoMemory();
15385 return NULL;
15386 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015387 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015388 return copy;
15389}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015390
Georg Brandl66c221e2010-10-14 07:04:07 +000015391/* A _string module, to export formatter_parser and formatter_field_name_split
15392 to the string.Formatter class implemented in Python. */
15393
/* Functions exposed by the _string module.  Both take exactly one
   argument (METH_O). */
15394static PyMethodDef _string_methods[] = {
15395 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15396 METH_O, PyDoc_STR("split the argument as a field name")},
15397 {"formatter_parser", (PyCFunction) formatter_parser,
15398 METH_O, PyDoc_STR("parse the argument as a format string")},
15399 {NULL, NULL} /* sentinel */
15400};
15401
/* Module definition for "_string", passed to PyModule_Create() by
   PyInit__string().  Fields are filled positionally; the trailing NULL
   entries leave the remaining optional fields unset. */
15402static struct PyModuleDef _string_module = {
15403 PyModuleDef_HEAD_INIT,
15404 "_string", /* m_name */
15405 PyDoc_STR("string helper module"), /* m_doc */
15406 0, /* m_size */
15407 _string_methods, /* m_methods */
15408 NULL,
15409 NULL,
15410 NULL,
15411 NULL
15412};
15413
15414PyMODINIT_FUNC
15415PyInit__string(void)
15416{
15417 return PyModule_Create(&_string_module);
15418}
15419
15420
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015421#ifdef __cplusplus
15422}
15423#endif