blob: af70ede749d18b9179b38455c1b5ec4ca94db3a4 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Larry Hastings61272b72014-01-07 12:41:53 -080050/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080051class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080052[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080053/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080054
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055/* --- Globals ------------------------------------------------------------
56
Serhiy Storchaka05997252013-01-26 12:14:02 +020057NOTE: In the interpreter's initialization phase, some globals are currently
58 initialized dynamically as needed. In the process Unicode objects may
59 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000060
61*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063
64#ifdef __cplusplus
65extern "C" {
66#endif
67
/* Largest code point defined by Unicode 6.0: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Argument check used by the macros below: debug builds run the
   consistency checker (without the content scan), other builds only
   verify the object type. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020076
/* Unchecked field accessors: each one casts op to the structure holding
   the field and expands to the member itself. */
#define _PyUnicode_UTF8(op)         (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op)         (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject*)(op))->data.any)

/* Accessors that first assert that op passes _PyUnicode_CHECK(). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* UTF-8 buffer of a ready string.  For a compact ASCII string this is the
   memory located directly after the PyASCIIObject header; otherwise it is
   the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     (PyUnicode_IS_COMPACT_ASCII(op)                    \
        ? ((char*)((PyASCIIObject*)(op) + 1))           \
        : _PyUnicode_UTF8(op)))

/* Length of the UTF-8 buffer of a ready string.  For a compact ASCII
   string this is the length field; otherwise it is the utf8_length
   field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     (PyUnicode_IS_COMPACT_ASCII(op)                    \
        ? ((PyASCIIObject*)(op))->length                \
        : _PyUnicode_UTF8_LENGTH(op)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200111
/* Local replacement for PyUnicode_READY() that validates its argument with
   _PyUnicode_CHECK().  Evaluates to 0 when the string is already ready,
   otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                     \
    (assert(_PyUnicode_CHECK(op)),              \
     (!PyUnicode_IS_READY(op)                   \
        ? _PyUnicode_Ready(op)                  \
        : 0))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True if the utf8 pointer is the string's own data buffer, i.e. the UTF-8
   form does not live in a separate block.  Must not be applied to compact
   ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer is the string's own data buffer.  Every
   reference goes through the macro parameter, so the expansion does not
   depend on how the caller names its variable. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* True if the Unicode object owns a separately allocated UTF-8 block:
   it is not compact ASCII, its utf8 pointer is set, and that pointer is
   not the data buffer itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op) != NULL                     \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object owns a separately allocated wstr block:
   its wstr pointer is set and the string is either not ready yet or the
   pointer is not the data buffer itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) != NULL                        \
     && (!PyUnicode_IS_READY(op)                        \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
140
/* Generic character-width conversion.

   from_type / to_type  valid type names of the source and target characters
   begin, end           pointers of type "from_type *" delimiting the source
   to                   pointer of type "to_type *" to the output buffer

   Each source character is cast to to_type and stored sequentially at "to".
   The bulk of the input is handled four characters per iteration; the
   remaining (at most three) characters are copied one by one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (from_type *)(begin);                  \
        const from_type *_end = (from_type *)(end);                     \
        Py_ssize_t _n = _end - _iter;                                   \
        const from_type *_unrolled_end =                                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);                         \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) {           \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        for (; _iter < _end; ++_iter, ++_to)                            \
            *_to = (to_type) *_iter;                                    \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
Walter Dörwald16807132007-05-25 13:52:07 +0000165/* This dictionary holds all interned unicode strings. Note that references
166 to strings in this dictionary are *not* counted in the string's ob_refcnt.
167 When the interned string reaches a refcnt of 0 the string deallocation
168 function will delete the reference from this dictionary.
169
170 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000171 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000172*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200173static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000174
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200176static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200177
Serhiy Storchaka678db842013-01-26 12:16:36 +0200178#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200179 do { \
180 if (unicode_empty != NULL) \
181 Py_INCREF(unicode_empty); \
182 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200183 unicode_empty = PyUnicode_New(0, 0); \
184 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200185 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200186 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
187 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200189 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190
Serhiy Storchaka678db842013-01-26 12:16:36 +0200191#define _Py_RETURN_UNICODE_EMPTY() \
192 do { \
193 _Py_INCREF_UNICODE_EMPTY(); \
194 return unicode_empty; \
195 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200197/* Forward declaration */
198Py_LOCAL_INLINE(int)
199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
200
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters: one entry
   per code point below 128, set to 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x0009: CHARACTER TABULATION
     *   0x000A: LINE FEED
     *   0x000B: LINE TABULATION
     *   0x000C: FORM FEED
     *   0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C: FILE SEPARATOR
     *   0x001D: GROUP SEPARATOR
     *   0x001E: RECORD SEPARATOR
     *   0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
     *   0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100242static int unicode_modifiable(PyObject *unicode);
243
Victor Stinnerfe226c02011-10-03 03:52:20 +0200244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200247static PyObject *
248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
249static PyObject *
250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
251
252static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100255 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
257
Alexander Belopolsky40018472011-02-26 01:02:56 +0000258static void
259raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300260 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100261 PyObject *unicode,
262 Py_ssize_t startpos, Py_ssize_t endpos,
263 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000264
/* Same for linebreaks: one entry per code point below 128, set to 1 for
   line-breaking characters and 0 otherwise.  The table is only ever read,
   hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x000A: LINE FEED
     *   0x000B: LINE TABULATION
     *   0x000C: FORM FEED
     *   0x000D: CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C: FILE SEPARATOR
     *   0x001D: GROUP SEPARATOR
     *   0x001E: RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
292
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300293/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
294 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000296PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000298#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000299 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000300#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 /* This is actually an illegal character, so it should
302 not be passed to unichr. */
303 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000304#endif
305}
306
#ifdef Py_DEBUG
/* Debug-build validator for the internal invariants of a unicode object.

   op             object to inspect; must satisfy PyUnicode_Check()
   check_content  when non-zero (and the string is not of the wchar kind),
                  also scan the characters to verify that the narrowest
                  suitable kind is used and that the character following
                  the last one is 0

   Every violated invariant trips an assert(); otherwise 1 is returned, so
   the call itself can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    unsigned int kind;

    assert(PyUnicode_Check(op));
    kind = ascii->state.kind;

    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* kind whose character width equals that of wchar_t */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wstr_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wstr_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* not compact, not ready: only the wstr buffer exists */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(ascii->length == 0);
            assert(ascii->hash == -1);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
        }
        else {
            /* not compact, ready */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert (compact->utf8 == data);
                assert (compact->utf8_length == ascii->length);
            }
            else {
                assert (compact->utf8 != data);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wstr_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    else {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t i = 0;

        while (i < ascii->length) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (maxchar < ch)
                maxchar = ch;
            i++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
422
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100423static PyObject*
424unicode_result_wchar(PyObject *unicode)
425{
426#ifndef Py_DEBUG
427 Py_ssize_t len;
428
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100429 len = _PyUnicode_WSTR_LENGTH(unicode);
430 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200432 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 }
434
435 if (len == 1) {
436 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100437 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100438 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
439 Py_DECREF(unicode);
440 return latin1_char;
441 }
442 }
443
444 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200445 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100446 return NULL;
447 }
448#else
Victor Stinneraa771272012-10-04 02:32:58 +0200449 assert(Py_REFCNT(unicode) == 1);
450
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100451 /* don't make the result ready in debug mode to ensure that the caller
452 makes the string ready before using it */
453 assert(_PyUnicode_CheckConsistency(unicode, 1));
454#endif
455 return unicode;
456}
457
458static PyObject*
459unicode_result_ready(PyObject *unicode)
460{
461 Py_ssize_t length;
462
463 length = PyUnicode_GET_LENGTH(unicode);
464 if (length == 0) {
465 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100466 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200467 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 }
469 return unicode_empty;
470 }
471
472 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200473 void *data = PyUnicode_DATA(unicode);
474 int kind = PyUnicode_KIND(unicode);
475 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100476 if (ch < 256) {
477 PyObject *latin1_char = unicode_latin1[ch];
478 if (latin1_char != NULL) {
479 if (unicode != latin1_char) {
480 Py_INCREF(latin1_char);
481 Py_DECREF(unicode);
482 }
483 return latin1_char;
484 }
485 else {
486 assert(_PyUnicode_CheckConsistency(unicode, 1));
487 Py_INCREF(unicode);
488 unicode_latin1[ch] = unicode;
489 return unicode;
490 }
491 }
492 }
493
494 assert(_PyUnicode_CheckConsistency(unicode, 1));
495 return unicode;
496}
497
498static PyObject*
499unicode_result(PyObject *unicode)
500{
501 assert(_PyUnicode_CHECK(unicode));
502 if (PyUnicode_IS_READY(unicode))
503 return unicode_result_ready(unicode);
504 else
505 return unicode_result_wchar(unicode);
506}
507
Victor Stinnerc4b49542011-12-11 22:44:26 +0100508static PyObject*
509unicode_result_unchanged(PyObject *unicode)
510{
511 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500512 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100513 return NULL;
514 Py_INCREF(unicode);
515 return unicode;
516 }
517 else
518 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100519 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100520}
521
#if defined(HAVE_MBCS)
/* Windows version information; only present in builds with HAVE_MBCS. */
static OSVERSIONINFOEX winver;
#endif
525
Thomas Wouters477c8d52006-05-27 19:21:47 +0000526/* --- Bloom Filters ----------------------------------------------------- */
527
528/* stuff to implement simple "bloom filters" for Unicode characters.
529 to keep things simple, we use a single bitmask, using the least 5
530 bits from each unicode characters as the bit index. */
531
532/* the linebreak mask is set up by Unicode_Init below */
533
Antoine Pitrouf068f942010-01-13 14:19:12 +0000534#if LONG_BIT >= 128
535#define BLOOM_WIDTH 128
536#elif LONG_BIT >= 64
537#define BLOOM_WIDTH 64
538#elif LONG_BIT >= 32
539#define BLOOM_WIDTH 32
540#else
541#error "LONG_BIT is smaller than 32"
542#endif
543
Thomas Wouters477c8d52006-05-27 19:21:47 +0000544#define BLOOM_MASK unsigned long
545
Serhiy Storchaka05997252013-01-26 12:14:02 +0200546static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547
Antoine Pitrouf068f942010-01-13 14:19:12 +0000548#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000549
Benjamin Peterson29060642009-01-31 22:14:21 +0000550#define BLOOM_LINEBREAK(ch) \
551 ((ch) < 128U ? ascii_linebreak[(ch)] : \
552 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000553
Alexander Belopolsky40018472011-02-26 01:02:56 +0000554Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000556{
Victor Stinnera85af502013-04-09 21:53:54 +0200557#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
558 do { \
559 TYPE *data = (TYPE *)PTR; \
560 TYPE *end = data + LEN; \
561 Py_UCS4 ch; \
562 for (; data != end; data++) { \
563 ch = *data; \
564 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
565 } \
566 break; \
567 } while (0)
568
Thomas Wouters477c8d52006-05-27 19:21:47 +0000569 /* calculate simple bloom-style bitmask for a given unicode string */
570
Antoine Pitrouf068f942010-01-13 14:19:12 +0000571 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000572
573 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200574 switch (kind) {
575 case PyUnicode_1BYTE_KIND:
576 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
577 break;
578 case PyUnicode_2BYTE_KIND:
579 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
580 break;
581 case PyUnicode_4BYTE_KIND:
582 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
583 break;
584 default:
585 assert(0);
586 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000587 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200588
589#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000590}
591
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200592/* Compilation of templated routines */
593
594#include "stringlib/asciilib.h"
595#include "stringlib/fastsearch.h"
596#include "stringlib/partition.h"
597#include "stringlib/split.h"
598#include "stringlib/count.h"
599#include "stringlib/find.h"
600#include "stringlib/find_max_char.h"
601#include "stringlib/localeutil.h"
602#include "stringlib/undef.h"
603
604#include "stringlib/ucs1lib.h"
605#include "stringlib/fastsearch.h"
606#include "stringlib/partition.h"
607#include "stringlib/split.h"
608#include "stringlib/count.h"
609#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300610#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200611#include "stringlib/find_max_char.h"
612#include "stringlib/localeutil.h"
613#include "stringlib/undef.h"
614
615#include "stringlib/ucs2lib.h"
616#include "stringlib/fastsearch.h"
617#include "stringlib/partition.h"
618#include "stringlib/split.h"
619#include "stringlib/count.h"
620#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300621#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200622#include "stringlib/find_max_char.h"
623#include "stringlib/localeutil.h"
624#include "stringlib/undef.h"
625
626#include "stringlib/ucs4lib.h"
627#include "stringlib/fastsearch.h"
628#include "stringlib/partition.h"
629#include "stringlib/split.h"
630#include "stringlib/count.h"
631#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300632#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200633#include "stringlib/find_max_char.h"
634#include "stringlib/localeutil.h"
635#include "stringlib/undef.h"
636
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200637#include "stringlib/unicodedefs.h"
638#include "stringlib/fastsearch.h"
639#include "stringlib/count.h"
640#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100641#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643/* --- Unicode Object ----------------------------------------------------- */
644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200646fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200648Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
649 Py_ssize_t size, Py_UCS4 ch,
650 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200651{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200652 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
653
654 switch (kind) {
655 case PyUnicode_1BYTE_KIND:
656 {
657 Py_UCS1 ch1 = (Py_UCS1) ch;
658 if (ch1 == ch)
659 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
660 else
661 return -1;
662 }
663 case PyUnicode_2BYTE_KIND:
664 {
665 Py_UCS2 ch2 = (Py_UCS2) ch;
666 if (ch2 == ch)
667 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
668 else
669 return -1;
670 }
671 case PyUnicode_4BYTE_KIND:
672 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
673 default:
674 assert(0);
675 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200677}
678
Victor Stinnerafffce42012-10-03 23:03:17 +0200679#ifdef Py_DEBUG
680/* Fill the data of an Unicode string with invalid characters to detect bugs
681 earlier.
682
683 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
684 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
685 invalid character in Unicode 6.0. */
686static void
687unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
688{
689 int kind = PyUnicode_KIND(unicode);
690 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
691 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
692 if (length <= old_length)
693 return;
694 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
695}
696#endif
697
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698static PyObject*
699resize_compact(PyObject *unicode, Py_ssize_t length)
700{
701 Py_ssize_t char_size;
702 Py_ssize_t struct_size;
703 Py_ssize_t new_size;
704 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100705 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200706#ifdef Py_DEBUG
707 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
708#endif
709
Victor Stinner79891572012-05-03 13:43:07 +0200710 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200711 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100712 assert(PyUnicode_IS_COMPACT(unicode));
713
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200714 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100715 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 struct_size = sizeof(PyASCIIObject);
717 else
718 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200719 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200720
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
722 PyErr_NoMemory();
723 return NULL;
724 }
725 new_size = (struct_size + (length + 1) * char_size);
726
Victor Stinner84def372011-12-11 20:04:56 +0100727 _Py_DEC_REFTOTAL;
728 _Py_ForgetReference(unicode);
729
730 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
731 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100732 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200733 PyErr_NoMemory();
734 return NULL;
735 }
Victor Stinner84def372011-12-11 20:04:56 +0100736 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100738
Victor Stinnerfe226c02011-10-03 03:52:20 +0200739 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200740 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100742 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200743 _PyUnicode_WSTR_LENGTH(unicode) = length;
744 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100745 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
746 PyObject_DEL(_PyUnicode_WSTR(unicode));
747 _PyUnicode_WSTR(unicode) = NULL;
748 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200749#ifdef Py_DEBUG
750 unicode_fill_invalid(unicode, old_length);
751#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200752 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
753 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200754 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200755 return unicode;
756}
757
Alexander Belopolsky40018472011-02-26 01:02:56 +0000758static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200759resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000760{
Victor Stinner95663112011-10-04 01:03:50 +0200761 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100762 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200763 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000765
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 if (PyUnicode_IS_READY(unicode)) {
767 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200768 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200770#ifdef Py_DEBUG
771 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
772#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773
774 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200775 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200776 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
777 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200778
779 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
780 PyErr_NoMemory();
781 return -1;
782 }
783 new_size = (length + 1) * char_size;
784
Victor Stinner7a9105a2011-12-12 00:13:42 +0100785 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
786 {
787 PyObject_DEL(_PyUnicode_UTF8(unicode));
788 _PyUnicode_UTF8(unicode) = NULL;
789 _PyUnicode_UTF8_LENGTH(unicode) = 0;
790 }
791
Victor Stinnerfe226c02011-10-03 03:52:20 +0200792 data = (PyObject *)PyObject_REALLOC(data, new_size);
793 if (data == NULL) {
794 PyErr_NoMemory();
795 return -1;
796 }
797 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200798 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200799 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200800 _PyUnicode_WSTR_LENGTH(unicode) = length;
801 }
802 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200803 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200804 _PyUnicode_UTF8_LENGTH(unicode) = length;
805 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200806 _PyUnicode_LENGTH(unicode) = length;
807 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200808#ifdef Py_DEBUG
809 unicode_fill_invalid(unicode, old_length);
810#endif
Victor Stinner95663112011-10-04 01:03:50 +0200811 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200812 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200814 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200815 }
Victor Stinner95663112011-10-04 01:03:50 +0200816 assert(_PyUnicode_WSTR(unicode) != NULL);
817
818 /* check for integer overflow */
819 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
820 PyErr_NoMemory();
821 return -1;
822 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100823 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200824 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100825 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200826 if (!wstr) {
827 PyErr_NoMemory();
828 return -1;
829 }
830 _PyUnicode_WSTR(unicode) = wstr;
831 _PyUnicode_WSTR(unicode)[length] = 0;
832 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200833 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834 return 0;
835}
836
Victor Stinnerfe226c02011-10-03 03:52:20 +0200837static PyObject*
838resize_copy(PyObject *unicode, Py_ssize_t length)
839{
840 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100841 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200842 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100843
Benjamin Petersonbac79492012-01-14 13:34:47 -0500844 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100845 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200846
847 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
848 if (copy == NULL)
849 return NULL;
850
851 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200852 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200853 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200854 }
855 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200856 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100857
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200858 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200859 if (w == NULL)
860 return NULL;
861 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
862 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200863 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
864 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200865 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200866 }
867}
868
Guido van Rossumd57fd912000-03-10 22:53:23 +0000869/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000870 Ux0000 terminated; some code (e.g. new_identifier)
871 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000872
873 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000874 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000875
876*/
877
Alexander Belopolsky40018472011-02-26 01:02:56 +0000878static PyUnicodeObject *
879_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200881 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200882 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883
Thomas Wouters477c8d52006-05-27 19:21:47 +0000884 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885 if (length == 0 && unicode_empty != NULL) {
886 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200887 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000888 }
889
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000890 /* Ensure we won't overflow the size. */
891 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
892 return (PyUnicodeObject *)PyErr_NoMemory();
893 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200894 if (length < 0) {
895 PyErr_SetString(PyExc_SystemError,
896 "Negative size passed to _PyUnicode_New");
897 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000898 }
899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200900 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
901 if (unicode == NULL)
902 return NULL;
903 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100904
905 _PyUnicode_WSTR_LENGTH(unicode) = length;
906 _PyUnicode_HASH(unicode) = -1;
907 _PyUnicode_STATE(unicode).interned = 0;
908 _PyUnicode_STATE(unicode).kind = 0;
909 _PyUnicode_STATE(unicode).compact = 0;
910 _PyUnicode_STATE(unicode).ready = 0;
911 _PyUnicode_STATE(unicode).ascii = 0;
912 _PyUnicode_DATA_ANY(unicode) = NULL;
913 _PyUnicode_LENGTH(unicode) = 0;
914 _PyUnicode_UTF8(unicode) = NULL;
915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
918 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100919 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000920 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100921 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000922 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200923
Jeremy Hyltond8082792003-09-16 19:41:39 +0000924 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000925 * the caller fails before initializing str -- unicode_resize()
926 * reads str[0], and the Keep-Alive optimization can keep memory
927 * allocated for str alive across a call to unicode_dealloc(unicode).
928 * We don't want unicode_resize to read uninitialized memory in
929 * that case.
930 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200931 _PyUnicode_WSTR(unicode)[0] = 0;
932 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100933
Victor Stinner7931d9a2011-11-04 00:22:48 +0100934 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000935 return unicode;
936}
937
Victor Stinnerf42dc442011-10-02 23:33:16 +0200938static const char*
939unicode_kind_name(PyObject *unicode)
940{
Victor Stinner42dfd712011-10-03 14:41:45 +0200941 /* don't check consistency: unicode_kind_name() is called from
942 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943 if (!PyUnicode_IS_COMPACT(unicode))
944 {
945 if (!PyUnicode_IS_READY(unicode))
946 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600947 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200948 {
949 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200951 return "legacy ascii";
952 else
953 return "legacy latin1";
954 case PyUnicode_2BYTE_KIND:
955 return "legacy UCS2";
956 case PyUnicode_4BYTE_KIND:
957 return "legacy UCS4";
958 default:
959 return "<legacy invalid kind>";
960 }
961 }
962 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600963 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200964 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200965 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200966 return "ascii";
967 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200968 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200970 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200971 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200972 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200973 default:
974 return "<invalid compact kind>";
975 }
976}
977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200978#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}
983
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}
987void *_PyUnicode_data(void *unicode){
988 printf("obj %p\n", unicode);
989 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
990 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
991 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
992 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
993 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
994 return PyUnicode_DATA(unicode);
995}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200996
997void
998_PyUnicode_Dump(PyObject *op)
999{
1000 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001001 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1002 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1003 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001004
Victor Stinnera849a4b2011-10-03 12:12:11 +02001005 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001006 {
1007 if (ascii->state.ascii)
1008 data = (ascii + 1);
1009 else
1010 data = (compact + 1);
1011 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001012 else
1013 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +02001014 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
1015
Victor Stinnera849a4b2011-10-03 12:12:11 +02001016 if (ascii->wstr == data)
1017 printf("shared ");
1018 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001019
Victor Stinnera3b334d2011-10-03 13:53:37 +02001020 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001021 printf(" (%zu), ", compact->wstr_length);
1022 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1023 printf("shared ");
1024 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001025 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001026 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001027}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001028#endif
1029
1030PyObject *
1031PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1032{
1033 PyObject *obj;
1034 PyCompactUnicodeObject *unicode;
1035 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001036 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001037 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001038 Py_ssize_t char_size;
1039 Py_ssize_t struct_size;
1040
1041 /* Optimization for empty strings */
1042 if (size == 0 && unicode_empty != NULL) {
1043 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001044 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 }
1046
Victor Stinner9e9d6892011-10-04 01:02:02 +02001047 is_ascii = 0;
1048 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 struct_size = sizeof(PyCompactUnicodeObject);
1050 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001051 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 char_size = 1;
1053 is_ascii = 1;
1054 struct_size = sizeof(PyASCIIObject);
1055 }
1056 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001057 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 char_size = 1;
1059 }
1060 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001061 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 char_size = 2;
1063 if (sizeof(wchar_t) == 2)
1064 is_sharing = 1;
1065 }
1066 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001067 if (maxchar > MAX_UNICODE) {
1068 PyErr_SetString(PyExc_SystemError,
1069 "invalid maximum character passed to PyUnicode_New");
1070 return NULL;
1071 }
Victor Stinner8f825062012-04-27 13:55:39 +02001072 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001073 char_size = 4;
1074 if (sizeof(wchar_t) == 4)
1075 is_sharing = 1;
1076 }
1077
1078 /* Ensure we won't overflow the size. */
1079 if (size < 0) {
1080 PyErr_SetString(PyExc_SystemError,
1081 "Negative size passed to PyUnicode_New");
1082 return NULL;
1083 }
1084 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1085 return PyErr_NoMemory();
1086
1087 /* Duplicated allocation code from _PyObject_New() instead of a call to
1088 * PyObject_New() so we are able to allocate space for the object and
1089 * it's data buffer.
1090 */
1091 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1092 if (obj == NULL)
1093 return PyErr_NoMemory();
1094 obj = PyObject_INIT(obj, &PyUnicode_Type);
1095 if (obj == NULL)
1096 return NULL;
1097
1098 unicode = (PyCompactUnicodeObject *)obj;
1099 if (is_ascii)
1100 data = ((PyASCIIObject*)obj) + 1;
1101 else
1102 data = unicode + 1;
1103 _PyUnicode_LENGTH(unicode) = size;
1104 _PyUnicode_HASH(unicode) = -1;
1105 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001106 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 _PyUnicode_STATE(unicode).compact = 1;
1108 _PyUnicode_STATE(unicode).ready = 1;
1109 _PyUnicode_STATE(unicode).ascii = is_ascii;
1110 if (is_ascii) {
1111 ((char*)data)[size] = 0;
1112 _PyUnicode_WSTR(unicode) = NULL;
1113 }
Victor Stinner8f825062012-04-27 13:55:39 +02001114 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001115 ((char*)data)[size] = 0;
1116 _PyUnicode_WSTR(unicode) = NULL;
1117 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001119 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001120 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 else {
1122 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001123 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001124 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001126 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127 ((Py_UCS4*)data)[size] = 0;
1128 if (is_sharing) {
1129 _PyUnicode_WSTR_LENGTH(unicode) = size;
1130 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1131 }
1132 else {
1133 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1134 _PyUnicode_WSTR(unicode) = NULL;
1135 }
1136 }
Victor Stinner8f825062012-04-27 13:55:39 +02001137#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001138 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001139#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001140 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141 return obj;
1142}
1143
1144#if SIZEOF_WCHAR_T == 2
1145/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1146 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001147 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148
1149 This function assumes that unicode can hold one more code point than wstr
1150 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001151static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001152unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001153 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001154{
1155 const wchar_t *iter;
1156 Py_UCS4 *ucs4_out;
1157
Victor Stinner910337b2011-10-03 03:20:16 +02001158 assert(unicode != NULL);
1159 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001160 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1161 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1162
1163 for (iter = begin; iter < end; ) {
1164 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1165 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001166 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1167 && (iter+1) < end
1168 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001169 {
Victor Stinner551ac952011-11-29 22:58:13 +01001170 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171 iter += 2;
1172 }
1173 else {
1174 *ucs4_out++ = *iter;
1175 iter++;
1176 }
1177 }
1178 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1179 _PyUnicode_GET_LENGTH(unicode)));
1180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181}
1182#endif
1183
Victor Stinnercd9950f2011-10-02 00:34:53 +02001184static int
Victor Stinner488fa492011-12-12 00:01:39 +01001185unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001186{
Victor Stinner488fa492011-12-12 00:01:39 +01001187 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001188 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001189 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001190 return -1;
1191 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001192 return 0;
1193}
1194
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001195static int
1196_copy_characters(PyObject *to, Py_ssize_t to_start,
1197 PyObject *from, Py_ssize_t from_start,
1198 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001199{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001200 unsigned int from_kind, to_kind;
1201 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001202
Victor Stinneree4544c2012-05-09 22:24:08 +02001203 assert(0 <= how_many);
1204 assert(0 <= from_start);
1205 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001206 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001207 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001208 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209
Victor Stinnerd3f08822012-05-29 12:57:52 +02001210 assert(PyUnicode_Check(to));
1211 assert(PyUnicode_IS_READY(to));
1212 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1213
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001214 if (how_many == 0)
1215 return 0;
1216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001218 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001221
Victor Stinnerf1852262012-06-16 16:38:26 +02001222#ifdef Py_DEBUG
1223 if (!check_maxchar
1224 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1225 {
1226 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1227 Py_UCS4 ch;
1228 Py_ssize_t i;
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 assert(ch <= to_maxchar);
1232 }
1233 }
1234#endif
1235
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001236 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001237 if (check_maxchar
1238 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1239 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001240 /* Writing Latin-1 characters into an ASCII string requires to
1241 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001242 Py_UCS4 max_char;
1243 max_char = ucs1lib_find_max_char(from_data,
1244 (Py_UCS1*)from_data + how_many);
1245 if (max_char >= 128)
1246 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001247 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001248 Py_MEMCPY((char*)to_data + to_kind * to_start,
1249 (char*)from_data + from_kind * from_start,
1250 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001252 else if (from_kind == PyUnicode_1BYTE_KIND
1253 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001254 {
1255 _PyUnicode_CONVERT_BYTES(
1256 Py_UCS1, Py_UCS2,
1257 PyUnicode_1BYTE_DATA(from) + from_start,
1258 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1259 PyUnicode_2BYTE_DATA(to) + to_start
1260 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001261 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001262 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001263 && to_kind == PyUnicode_4BYTE_KIND)
1264 {
1265 _PyUnicode_CONVERT_BYTES(
1266 Py_UCS1, Py_UCS4,
1267 PyUnicode_1BYTE_DATA(from) + from_start,
1268 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1269 PyUnicode_4BYTE_DATA(to) + to_start
1270 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001271 }
1272 else if (from_kind == PyUnicode_2BYTE_KIND
1273 && to_kind == PyUnicode_4BYTE_KIND)
1274 {
1275 _PyUnicode_CONVERT_BYTES(
1276 Py_UCS2, Py_UCS4,
1277 PyUnicode_2BYTE_DATA(from) + from_start,
1278 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1279 PyUnicode_4BYTE_DATA(to) + to_start
1280 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001281 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001282 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001283 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1284
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001285 if (!check_maxchar) {
1286 if (from_kind == PyUnicode_2BYTE_KIND
1287 && to_kind == PyUnicode_1BYTE_KIND)
1288 {
1289 _PyUnicode_CONVERT_BYTES(
1290 Py_UCS2, Py_UCS1,
1291 PyUnicode_2BYTE_DATA(from) + from_start,
1292 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1293 PyUnicode_1BYTE_DATA(to) + to_start
1294 );
1295 }
1296 else if (from_kind == PyUnicode_4BYTE_KIND
1297 && to_kind == PyUnicode_1BYTE_KIND)
1298 {
1299 _PyUnicode_CONVERT_BYTES(
1300 Py_UCS4, Py_UCS1,
1301 PyUnicode_4BYTE_DATA(from) + from_start,
1302 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1303 PyUnicode_1BYTE_DATA(to) + to_start
1304 );
1305 }
1306 else if (from_kind == PyUnicode_4BYTE_KIND
1307 && to_kind == PyUnicode_2BYTE_KIND)
1308 {
1309 _PyUnicode_CONVERT_BYTES(
1310 Py_UCS4, Py_UCS2,
1311 PyUnicode_4BYTE_DATA(from) + from_start,
1312 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1313 PyUnicode_2BYTE_DATA(to) + to_start
1314 );
1315 }
1316 else {
1317 assert(0);
1318 return -1;
1319 }
1320 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001321 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001322 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001323 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001324 Py_ssize_t i;
1325
Victor Stinnera0702ab2011-09-29 14:14:38 +02001326 for (i=0; i < how_many; i++) {
1327 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001328 if (ch > to_maxchar)
1329 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001330 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1331 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001332 }
1333 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001334 return 0;
1335}
1336
Victor Stinnerd3f08822012-05-29 12:57:52 +02001337void
1338_PyUnicode_FastCopyCharacters(
1339 PyObject *to, Py_ssize_t to_start,
1340 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001341{
1342 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1343}
1344
1345Py_ssize_t
1346PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1347 PyObject *from, Py_ssize_t from_start,
1348 Py_ssize_t how_many)
1349{
1350 int err;
1351
1352 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1353 PyErr_BadInternalCall();
1354 return -1;
1355 }
1356
Benjamin Petersonbac79492012-01-14 13:34:47 -05001357 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001358 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001359 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001360 return -1;
1361
Victor Stinnerd3f08822012-05-29 12:57:52 +02001362 if (from_start < 0) {
1363 PyErr_SetString(PyExc_IndexError, "string index out of range");
1364 return -1;
1365 }
1366 if (to_start < 0) {
1367 PyErr_SetString(PyExc_IndexError, "string index out of range");
1368 return -1;
1369 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001370 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1371 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1372 PyErr_Format(PyExc_SystemError,
1373 "Cannot write %zi characters at %zi "
1374 "in a string of %zi characters",
1375 how_many, to_start, PyUnicode_GET_LENGTH(to));
1376 return -1;
1377 }
1378
1379 if (how_many == 0)
1380 return 0;
1381
Victor Stinner488fa492011-12-12 00:01:39 +01001382 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001383 return -1;
1384
1385 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1386 if (err) {
1387 PyErr_Format(PyExc_SystemError,
1388 "Cannot copy %s characters "
1389 "into a string of %s characters",
1390 unicode_kind_name(from),
1391 unicode_kind_name(to));
1392 return -1;
1393 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001394 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395}
1396
Victor Stinner17222162011-09-28 22:15:37 +02001397/* Find the maximum code point and count the number of surrogate pairs so a
1398 correct string length can be computed before converting a string to UCS4.
1399 This function counts single surrogates as a character and not as a pair.
1400
1401 Return 0 on success, or -1 on error. */
1402static int
1403find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1404 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405{
1406 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001407 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408
Victor Stinnerc53be962011-10-02 21:33:54 +02001409 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 *num_surrogates = 0;
1411 *maxchar = 0;
1412
1413 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001415 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1416 && (iter+1) < end
1417 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1418 {
1419 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1420 ++(*num_surrogates);
1421 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 }
1423 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001425 {
1426 ch = *iter;
1427 iter++;
1428 }
1429 if (ch > *maxchar) {
1430 *maxchar = ch;
1431 if (*maxchar > MAX_UNICODE) {
1432 PyErr_Format(PyExc_ValueError,
1433 "character U+%x is not in range [U+0000; U+10ffff]",
1434 ch);
1435 return -1;
1436 }
1437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
1439 return 0;
1440}
1441
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001442int
1443_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444{
1445 wchar_t *end;
1446 Py_UCS4 maxchar = 0;
1447 Py_ssize_t num_surrogates;
1448#if SIZEOF_WCHAR_T == 2
1449 Py_ssize_t length_wo_surrogates;
1450#endif
1451
Georg Brandl7597add2011-10-05 16:36:47 +02001452 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001453 strings were created using _PyObject_New() and where no canonical
1454 representation (the str field) has been set yet aka strings
1455 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001456 assert(_PyUnicode_CHECK(unicode));
1457 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001459 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001460 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001461 /* Actually, it should neither be interned nor be anything else: */
1462 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001465 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001466 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468
1469 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001470 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1471 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 PyErr_NoMemory();
1473 return -1;
1474 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001475 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 _PyUnicode_WSTR(unicode), end,
1477 PyUnicode_1BYTE_DATA(unicode));
1478 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1479 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1480 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1481 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001482 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001483 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001484 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 }
1486 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001487 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001488 _PyUnicode_UTF8(unicode) = NULL;
1489 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 }
1491 PyObject_FREE(_PyUnicode_WSTR(unicode));
1492 _PyUnicode_WSTR(unicode) = NULL;
1493 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1494 }
1495 /* In this case we might have to convert down from 4-byte native
1496 wchar_t to 2-byte unicode. */
1497 else if (maxchar < 65536) {
1498 assert(num_surrogates == 0 &&
1499 "FindMaxCharAndNumSurrogatePairs() messed up");
1500
Victor Stinner506f5922011-09-28 22:34:18 +02001501#if SIZEOF_WCHAR_T == 2
1502 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001503 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001504 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1505 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1506 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001507 _PyUnicode_UTF8(unicode) = NULL;
1508 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001509#else
1510 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001511 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001512 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001513 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001514 PyErr_NoMemory();
1515 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516 }
Victor Stinner506f5922011-09-28 22:34:18 +02001517 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1518 _PyUnicode_WSTR(unicode), end,
1519 PyUnicode_2BYTE_DATA(unicode));
1520 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1521 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1522 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001523 _PyUnicode_UTF8(unicode) = NULL;
1524 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001525 PyObject_FREE(_PyUnicode_WSTR(unicode));
1526 _PyUnicode_WSTR(unicode) = NULL;
1527 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1528#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529 }
1530 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1531 else {
1532#if SIZEOF_WCHAR_T == 2
1533 /* in case the native representation is 2-bytes, we need to allocate a
1534 new normalized 4-byte version. */
1535 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001536 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1537 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538 PyErr_NoMemory();
1539 return -1;
1540 }
1541 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1542 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001543 _PyUnicode_UTF8(unicode) = NULL;
1544 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001545 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1546 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001547 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001548 PyObject_FREE(_PyUnicode_WSTR(unicode));
1549 _PyUnicode_WSTR(unicode) = NULL;
1550 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1551#else
1552 assert(num_surrogates == 0);
1553
Victor Stinnerc3c74152011-10-02 20:39:55 +02001554 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001555 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001556 _PyUnicode_UTF8(unicode) = NULL;
1557 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001558 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1559#endif
1560 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1561 }
1562 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001563 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001564 return 0;
1565}
1566
Alexander Belopolsky40018472011-02-26 01:02:56 +00001567static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001568unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569{
Walter Dörwald16807132007-05-25 13:52:07 +00001570 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001571 case SSTATE_NOT_INTERNED:
1572 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001573
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 case SSTATE_INTERNED_MORTAL:
1575 /* revive dead object temporarily for DelItem */
1576 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001577 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001578 Py_FatalError(
1579 "deletion of interned string failed");
1580 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001581
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 case SSTATE_INTERNED_IMMORTAL:
1583 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001584
Benjamin Peterson29060642009-01-31 22:14:21 +00001585 default:
1586 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001587 }
1588
Victor Stinner03490912011-10-03 23:45:12 +02001589 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001590 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001591 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001592 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001593 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1594 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001595
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001596 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001597}
1598
#ifdef Py_DEBUG
/* Debug helper: return 1 if 'unicode' is one of the shared singletons,
   i.e. the empty string object or the cached one-character string stored
   in unicode_latin1[] for its character.  Return 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a string with a canonical kind and exactly one character can be
       an entry of the Latin-1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1615
Alexander Belopolsky40018472011-02-26 01:02:56 +00001616static int
Victor Stinner488fa492011-12-12 00:01:39 +01001617unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001618{
Victor Stinner488fa492011-12-12 00:01:39 +01001619 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 if (Py_REFCNT(unicode) != 1)
1621 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001622 if (_PyUnicode_HASH(unicode) != -1)
1623 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001624 if (PyUnicode_CHECK_INTERNED(unicode))
1625 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001626 if (!PyUnicode_CheckExact(unicode))
1627 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001628#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001629 /* singleton refcount is greater than 1 */
1630 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001631#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001632 return 1;
1633}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001634
Victor Stinnerfe226c02011-10-03 03:52:20 +02001635static int
1636unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1637{
1638 PyObject *unicode;
1639 Py_ssize_t old_length;
1640
1641 assert(p_unicode != NULL);
1642 unicode = *p_unicode;
1643
1644 assert(unicode != NULL);
1645 assert(PyUnicode_Check(unicode));
1646 assert(0 <= length);
1647
Victor Stinner910337b2011-10-03 03:20:16 +02001648 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001649 old_length = PyUnicode_WSTR_LENGTH(unicode);
1650 else
1651 old_length = PyUnicode_GET_LENGTH(unicode);
1652 if (old_length == length)
1653 return 0;
1654
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001655 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001656 _Py_INCREF_UNICODE_EMPTY();
1657 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001658 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001659 Py_DECREF(*p_unicode);
1660 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001661 return 0;
1662 }
1663
Victor Stinner488fa492011-12-12 00:01:39 +01001664 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001665 PyObject *copy = resize_copy(unicode, length);
1666 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001667 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001668 Py_DECREF(*p_unicode);
1669 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001670 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001671 }
1672
Victor Stinnerfe226c02011-10-03 03:52:20 +02001673 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001674 PyObject *new_unicode = resize_compact(unicode, length);
1675 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001676 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001677 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001678 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001679 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001680 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001681}
1682
Alexander Belopolsky40018472011-02-26 01:02:56 +00001683int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001684PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001685{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001686 PyObject *unicode;
1687 if (p_unicode == NULL) {
1688 PyErr_BadInternalCall();
1689 return -1;
1690 }
1691 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001692 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001693 {
1694 PyErr_BadInternalCall();
1695 return -1;
1696 }
1697 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001698}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001699
Victor Stinnerc5166102012-02-22 13:55:02 +01001700/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001701
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001702 WARNING: The function doesn't copy the terminating null character and
1703 doesn't check the maximum character (may write a latin1 character in an
1704 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001705static void
1706unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1707 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001708{
1709 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1710 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001711 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001712
1713 switch (kind) {
1714 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001715 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001716#ifdef Py_DEBUG
1717 if (PyUnicode_IS_ASCII(unicode)) {
1718 Py_UCS4 maxchar = ucs1lib_find_max_char(
1719 (const Py_UCS1*)str,
1720 (const Py_UCS1*)str + len);
1721 assert(maxchar < 128);
1722 }
1723#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001724 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001725 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001726 }
1727 case PyUnicode_2BYTE_KIND: {
1728 Py_UCS2 *start = (Py_UCS2 *)data + index;
1729 Py_UCS2 *ucs2 = start;
1730 assert(index <= PyUnicode_GET_LENGTH(unicode));
1731
Victor Stinner184252a2012-06-16 02:57:41 +02001732 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001733 *ucs2 = (Py_UCS2)*str;
1734
1735 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001736 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001737 }
1738 default: {
1739 Py_UCS4 *start = (Py_UCS4 *)data + index;
1740 Py_UCS4 *ucs4 = start;
1741 assert(kind == PyUnicode_4BYTE_KIND);
1742 assert(index <= PyUnicode_GET_LENGTH(unicode));
1743
Victor Stinner184252a2012-06-16 02:57:41 +02001744 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001745 *ucs4 = (Py_UCS4)*str;
1746
1747 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001748 }
1749 }
1750}
1751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752static PyObject*
1753get_latin1_char(unsigned char ch)
1754{
Victor Stinnera464fc12011-10-02 20:39:30 +02001755 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001757 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 if (!unicode)
1759 return NULL;
1760 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001761 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 unicode_latin1[ch] = unicode;
1763 }
1764 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001765 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766}
1767
Victor Stinner985a82a2014-01-03 12:53:47 +01001768static PyObject*
1769unicode_char(Py_UCS4 ch)
1770{
1771 PyObject *unicode;
1772
1773 assert(ch <= MAX_UNICODE);
1774
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001775 if (ch < 256)
1776 return get_latin1_char(ch);
1777
Victor Stinner985a82a2014-01-03 12:53:47 +01001778 unicode = PyUnicode_New(1, ch);
1779 if (unicode == NULL)
1780 return NULL;
1781 switch (PyUnicode_KIND(unicode)) {
1782 case PyUnicode_1BYTE_KIND:
1783 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1784 break;
1785 case PyUnicode_2BYTE_KIND:
1786 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1787 break;
1788 default:
1789 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1790 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1791 }
1792 assert(_PyUnicode_CheckConsistency(unicode, 1));
1793 return unicode;
1794}
1795
Alexander Belopolsky40018472011-02-26 01:02:56 +00001796PyObject *
1797PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001799 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 Py_UCS4 maxchar = 0;
1801 Py_ssize_t num_surrogates;
1802
1803 if (u == NULL)
1804 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001806 /* If the Unicode data is known at construction time, we can apply
1807 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001810 if (size == 0)
1811 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 /* Single character Unicode objects in the Latin-1 range are
1814 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001815 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816 return get_latin1_char((unsigned char)*u);
1817
1818 /* If not empty and not single character, copy the Unicode data
1819 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001820 if (find_maxchar_surrogates(u, u + size,
1821 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822 return NULL;
1823
Victor Stinner8faf8212011-12-08 22:14:11 +01001824 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001825 if (!unicode)
1826 return NULL;
1827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828 switch (PyUnicode_KIND(unicode)) {
1829 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001830 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1832 break;
1833 case PyUnicode_2BYTE_KIND:
1834#if Py_UNICODE_SIZE == 2
1835 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1836#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001837 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001838 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1839#endif
1840 break;
1841 case PyUnicode_4BYTE_KIND:
1842#if SIZEOF_WCHAR_T == 2
1843 /* This is the only case which has to process surrogates, thus
1844 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001845 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846#else
1847 assert(num_surrogates == 0);
1848 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1849#endif
1850 break;
1851 default:
1852 assert(0 && "Impossible state");
1853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001855 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856}
1857
Alexander Belopolsky40018472011-02-26 01:02:56 +00001858PyObject *
1859PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001860{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001861 if (size < 0) {
1862 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001863 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001864 return NULL;
1865 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001866 if (u != NULL)
1867 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1868 else
1869 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001870}
1871
Alexander Belopolsky40018472011-02-26 01:02:56 +00001872PyObject *
1873PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001874{
1875 size_t size = strlen(u);
1876 if (size > PY_SSIZE_T_MAX) {
1877 PyErr_SetString(PyExc_OverflowError, "input too long");
1878 return NULL;
1879 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001880 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001881}
1882
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001883PyObject *
1884_PyUnicode_FromId(_Py_Identifier *id)
1885{
1886 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001887 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1888 strlen(id->string),
1889 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001890 if (!id->object)
1891 return NULL;
1892 PyUnicode_InternInPlace(&id->object);
1893 assert(!id->next);
1894 id->next = static_strings;
1895 static_strings = id;
1896 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001897 return id->object;
1898}
1899
1900void
1901_PyUnicode_ClearStaticStrings()
1902{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001903 _Py_Identifier *tmp, *s = static_strings;
1904 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001905 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001906 tmp = s->next;
1907 s->next = NULL;
1908 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001909 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001910 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001911}
1912
Benjamin Peterson0df54292012-03-26 14:50:32 -04001913/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001914
Victor Stinnerd3f08822012-05-29 12:57:52 +02001915PyObject*
1916_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001917{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001918 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001919 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001920 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001921#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001922 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001923#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001924 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001925 }
Victor Stinner785938e2011-12-11 20:09:03 +01001926 unicode = PyUnicode_New(size, 127);
1927 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001928 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001929 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1930 assert(_PyUnicode_CheckConsistency(unicode, 1));
1931 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001932}
1933
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001934static Py_UCS4
1935kind_maxchar_limit(unsigned int kind)
1936{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001937 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001938 case PyUnicode_1BYTE_KIND:
1939 return 0x80;
1940 case PyUnicode_2BYTE_KIND:
1941 return 0x100;
1942 case PyUnicode_4BYTE_KIND:
1943 return 0x10000;
1944 default:
1945 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001946 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001947 }
1948}
1949
Victor Stinnere6abb482012-05-02 01:15:40 +02001950Py_LOCAL_INLINE(Py_UCS4)
1951align_maxchar(Py_UCS4 maxchar)
1952{
1953 if (maxchar <= 127)
1954 return 127;
1955 else if (maxchar <= 255)
1956 return 255;
1957 else if (maxchar <= 65535)
1958 return 65535;
1959 else
1960 return MAX_UNICODE;
1961}
1962
Victor Stinner702c7342011-10-05 13:50:52 +02001963static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001964_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001965{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001966 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001967 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001968
Serhiy Storchaka678db842013-01-26 12:16:36 +02001969 if (size == 0)
1970 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001971 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001972 if (size == 1)
1973 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001974
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001975 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001976 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977 if (!res)
1978 return NULL;
1979 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001980 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001982}
1983
Victor Stinnere57b1c02011-09-28 22:20:48 +02001984static PyObject*
1985_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986{
1987 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001988 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001989
Serhiy Storchaka678db842013-01-26 12:16:36 +02001990 if (size == 0)
1991 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001992 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01001993 if (size == 1)
1994 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001995
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001996 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001997 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998 if (!res)
1999 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002000 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002001 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002002 else {
2003 _PyUnicode_CONVERT_BYTES(
2004 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2005 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002006 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002007 return res;
2008}
2009
Victor Stinnere57b1c02011-09-28 22:20:48 +02002010static PyObject*
2011_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012{
2013 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002014 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002015
Serhiy Storchaka678db842013-01-26 12:16:36 +02002016 if (size == 0)
2017 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002018 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002019 if (size == 1)
2020 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002021
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002022 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002023 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002024 if (!res)
2025 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002026 if (max_char < 256)
2027 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2028 PyUnicode_1BYTE_DATA(res));
2029 else if (max_char < 0x10000)
2030 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2031 PyUnicode_2BYTE_DATA(res));
2032 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002034 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 return res;
2036}
2037
2038PyObject*
2039PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2040{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002041 if (size < 0) {
2042 PyErr_SetString(PyExc_ValueError, "size must be positive");
2043 return NULL;
2044 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002045 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002047 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002049 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002051 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002052 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002053 PyErr_SetString(PyExc_SystemError, "invalid kind");
2054 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002055 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056}
2057
Victor Stinnerece58de2012-04-23 23:36:38 +02002058Py_UCS4
2059_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2060{
2061 enum PyUnicode_Kind kind;
2062 void *startptr, *endptr;
2063
2064 assert(PyUnicode_IS_READY(unicode));
2065 assert(0 <= start);
2066 assert(end <= PyUnicode_GET_LENGTH(unicode));
2067 assert(start <= end);
2068
2069 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2070 return PyUnicode_MAX_CHAR_VALUE(unicode);
2071
2072 if (start == end)
2073 return 127;
2074
Victor Stinner94d558b2012-04-27 22:26:58 +02002075 if (PyUnicode_IS_ASCII(unicode))
2076 return 127;
2077
Victor Stinnerece58de2012-04-23 23:36:38 +02002078 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002079 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002080 endptr = (char *)startptr + end * kind;
2081 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002082 switch(kind) {
2083 case PyUnicode_1BYTE_KIND:
2084 return ucs1lib_find_max_char(startptr, endptr);
2085 case PyUnicode_2BYTE_KIND:
2086 return ucs2lib_find_max_char(startptr, endptr);
2087 case PyUnicode_4BYTE_KIND:
2088 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002089 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002090 assert(0);
2091 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002092 }
2093}
2094
Victor Stinner25a4b292011-10-06 12:31:55 +02002095/* Ensure that a string uses the most efficient storage, if it is not the
2096 case: create a new string with of the right kind. Write NULL into *p_unicode
2097 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002098static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002099unicode_adjust_maxchar(PyObject **p_unicode)
2100{
2101 PyObject *unicode, *copy;
2102 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002103 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002104 unsigned int kind;
2105
2106 assert(p_unicode != NULL);
2107 unicode = *p_unicode;
2108 assert(PyUnicode_IS_READY(unicode));
2109 if (PyUnicode_IS_ASCII(unicode))
2110 return;
2111
2112 len = PyUnicode_GET_LENGTH(unicode);
2113 kind = PyUnicode_KIND(unicode);
2114 if (kind == PyUnicode_1BYTE_KIND) {
2115 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002116 max_char = ucs1lib_find_max_char(u, u + len);
2117 if (max_char >= 128)
2118 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002119 }
2120 else if (kind == PyUnicode_2BYTE_KIND) {
2121 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002122 max_char = ucs2lib_find_max_char(u, u + len);
2123 if (max_char >= 256)
2124 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002125 }
2126 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002127 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002128 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002129 max_char = ucs4lib_find_max_char(u, u + len);
2130 if (max_char >= 0x10000)
2131 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002132 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002133 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002134 if (copy != NULL)
2135 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002136 Py_DECREF(unicode);
2137 *p_unicode = copy;
2138}
2139
Victor Stinner034f6cf2011-09-30 02:26:44 +02002140PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002141_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002142{
Victor Stinner87af4f22011-11-21 23:03:47 +01002143 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002144 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002145
Victor Stinner034f6cf2011-09-30 02:26:44 +02002146 if (!PyUnicode_Check(unicode)) {
2147 PyErr_BadInternalCall();
2148 return NULL;
2149 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002150 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002151 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002152
Victor Stinner87af4f22011-11-21 23:03:47 +01002153 length = PyUnicode_GET_LENGTH(unicode);
2154 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002155 if (!copy)
2156 return NULL;
2157 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2158
Victor Stinner87af4f22011-11-21 23:03:47 +01002159 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2160 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002161 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002162 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002163}
2164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165
Victor Stinnerbc603d12011-10-02 01:00:40 +02002166/* Widen Unicode objects to larger buffers. Don't write terminating null
2167 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168
2169void*
2170_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2171{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002172 Py_ssize_t len;
2173 void *result;
2174 unsigned int skind;
2175
Benjamin Petersonbac79492012-01-14 13:34:47 -05002176 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002177 return NULL;
2178
2179 len = PyUnicode_GET_LENGTH(s);
2180 skind = PyUnicode_KIND(s);
2181 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002182 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002183 return NULL;
2184 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002185 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002186 case PyUnicode_2BYTE_KIND:
2187 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2188 if (!result)
2189 return PyErr_NoMemory();
2190 assert(skind == PyUnicode_1BYTE_KIND);
2191 _PyUnicode_CONVERT_BYTES(
2192 Py_UCS1, Py_UCS2,
2193 PyUnicode_1BYTE_DATA(s),
2194 PyUnicode_1BYTE_DATA(s) + len,
2195 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002197 case PyUnicode_4BYTE_KIND:
2198 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2199 if (!result)
2200 return PyErr_NoMemory();
2201 if (skind == PyUnicode_2BYTE_KIND) {
2202 _PyUnicode_CONVERT_BYTES(
2203 Py_UCS2, Py_UCS4,
2204 PyUnicode_2BYTE_DATA(s),
2205 PyUnicode_2BYTE_DATA(s) + len,
2206 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002207 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002208 else {
2209 assert(skind == PyUnicode_1BYTE_KIND);
2210 _PyUnicode_CONVERT_BYTES(
2211 Py_UCS1, Py_UCS4,
2212 PyUnicode_1BYTE_DATA(s),
2213 PyUnicode_1BYTE_DATA(s) + len,
2214 result);
2215 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002217 default:
2218 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002219 }
Victor Stinner01698042011-10-04 00:04:26 +02002220 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 return NULL;
2222}
2223
2224static Py_UCS4*
2225as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2226 int copy_null)
2227{
2228 int kind;
2229 void *data;
2230 Py_ssize_t len, targetlen;
2231 if (PyUnicode_READY(string) == -1)
2232 return NULL;
2233 kind = PyUnicode_KIND(string);
2234 data = PyUnicode_DATA(string);
2235 len = PyUnicode_GET_LENGTH(string);
2236 targetlen = len;
2237 if (copy_null)
2238 targetlen++;
2239 if (!target) {
2240 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2241 PyErr_NoMemory();
2242 return NULL;
2243 }
2244 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2245 if (!target) {
2246 PyErr_NoMemory();
2247 return NULL;
2248 }
2249 }
2250 else {
2251 if (targetsize < targetlen) {
2252 PyErr_Format(PyExc_SystemError,
2253 "string is longer than the buffer");
2254 if (copy_null && 0 < targetsize)
2255 target[0] = 0;
2256 return NULL;
2257 }
2258 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002259 if (kind == PyUnicode_1BYTE_KIND) {
2260 Py_UCS1 *start = (Py_UCS1 *) data;
2261 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002262 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002263 else if (kind == PyUnicode_2BYTE_KIND) {
2264 Py_UCS2 *start = (Py_UCS2 *) data;
2265 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2266 }
2267 else {
2268 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002269 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002270 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 if (copy_null)
2272 target[len] = 0;
2273 return target;
2274}
2275
2276Py_UCS4*
2277PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2278 int copy_null)
2279{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002280 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281 PyErr_BadInternalCall();
2282 return NULL;
2283 }
2284 return as_ucs4(string, target, targetsize, copy_null);
2285}
2286
2287Py_UCS4*
2288PyUnicode_AsUCS4Copy(PyObject *string)
2289{
2290 return as_ucs4(string, NULL, 0, 1);
2291}
2292
2293#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002294
Alexander Belopolsky40018472011-02-26 01:02:56 +00002295PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002296PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002300 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002301 PyErr_BadInternalCall();
2302 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303 }
2304
Martin v. Löwis790465f2008-04-05 20:41:37 +00002305 if (size == -1) {
2306 size = wcslen(w);
2307 }
2308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002309 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002310}
2311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002312#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002313
Walter Dörwald346737f2007-05-31 10:44:43 +00002314static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002315makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002316 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002317{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002318 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002319 if (longflag)
2320 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002321 else if (longlongflag) {
2322 /* longlongflag should only ever be nonzero on machines with
2323 HAVE_LONG_LONG defined */
2324#ifdef HAVE_LONG_LONG
2325 char *f = PY_FORMAT_LONG_LONG;
2326 while (*f)
2327 *fmt++ = *f++;
2328#else
2329 /* we shouldn't ever get here */
2330 assert(0);
2331 *fmt++ = 'l';
2332#endif
2333 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002334 else if (size_tflag) {
2335 char *f = PY_FORMAT_SIZE_T;
2336 while (*f)
2337 *fmt++ = *f++;
2338 }
2339 *fmt++ = c;
2340 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002341}
2342
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign and 1 for the terminating null byte written by
   sprintf(). 53/22 is an upper bound for log10(256). */
2346#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002347
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002348static int
2349unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2350 Py_ssize_t width, Py_ssize_t precision)
2351{
2352 Py_ssize_t length, fill, arglen;
2353 Py_UCS4 maxchar;
2354
2355 if (PyUnicode_READY(str) == -1)
2356 return -1;
2357
2358 length = PyUnicode_GET_LENGTH(str);
2359 if ((precision == -1 || precision >= length)
2360 && width <= length)
2361 return _PyUnicodeWriter_WriteStr(writer, str);
2362
2363 if (precision != -1)
2364 length = Py_MIN(precision, length);
2365
2366 arglen = Py_MAX(length, width);
2367 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2368 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2369 else
2370 maxchar = writer->maxchar;
2371
2372 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2373 return -1;
2374
2375 if (width > length) {
2376 fill = width - length;
2377 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2378 return -1;
2379 writer->pos += fill;
2380 }
2381
2382 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2383 str, 0, length);
2384 writer->pos += length;
2385 return 0;
2386}
2387
2388static int
2389unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2390 Py_ssize_t width, Py_ssize_t precision)
2391{
2392 /* UTF-8 */
2393 Py_ssize_t length;
2394 PyObject *unicode;
2395 int res;
2396
2397 length = strlen(str);
2398 if (precision != -1)
2399 length = Py_MIN(length, precision);
2400 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2401 if (unicode == NULL)
2402 return -1;
2403
2404 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2405 Py_DECREF(unicode);
2406 return res;
2407}
2408
/* Format a single '%' directive on behalf of PyUnicode_FromFormatV().

   f points at the '%' that starts the directive. The optional '0' flag,
   the width, the ".precision" and the l / ll / z length modifiers are
   parsed, the matching argument(s) are fetched from *vargs with va_arg(),
   and the formatted text is appended to writer.

   Returns a pointer to the first character after the directive, or NULL on
   error. For an unknown conversion character nothing is fetched from
   *vargs: the rest of the format string, starting at the '%', is copied to
   the output and a pointer to its end is returned. */
Victor Stinner96865452011-03-01 23:44:09 +00002409static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002410unicode_fromformat_arg(_PyUnicodeWriter *writer,
2411 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002412{
Victor Stinnere215d962012-10-06 23:03:36 +02002413 const char *p;
2414 Py_ssize_t len;
2415 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002416 Py_ssize_t width;
2417 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002418 int longflag;
2419 int longlongflag;
2420 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002421 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002422
/* p keeps pointing at the '%': the default case of the switch below copies
   the format string from there */
2423 p = f;
2424 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002425 zeropad = 0;
2426 if (*f == '0') {
2427 zeropad = 1;
2428 f++;
2429 }
Victor Stinner96865452011-03-01 23:44:09 +00002430
2431 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002432 width = -1;
2433 if (Py_ISDIGIT((unsigned)*f)) {
2434 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002435 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002436 while (Py_ISDIGIT((unsigned)*f)) {
/* overflow check done before the multiplication below */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002437 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002438 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002439 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002440 return NULL;
2441 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002442 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002443 f++;
2444 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002445 }
2446 precision = -1;
2447 if (*f == '.') {
2448 f++;
2449 if (Py_ISDIGIT((unsigned)*f)) {
2450 precision = (*f - '0');
2451 f++;
2452 while (Py_ISDIGIT((unsigned)*f)) {
2453 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2454 PyErr_SetString(PyExc_ValueError,
2455 "precision too big");
2456 return NULL;
2457 }
2458 precision = (precision * 10) + (*f - '0');
2459 f++;
2460 }
2461 }
Victor Stinner96865452011-03-01 23:44:09 +00002462 if (*f == '%') {
2463 /* "%.3%s" => f points to "3" */
2464 f--;
2465 }
2466 }
2467 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002468 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002469 f--;
2470 }
Victor Stinner96865452011-03-01 23:44:09 +00002471
2472 /* Handle %ld, %lu, %lld and %llu. */
2473 longflag = 0;
2474 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002475 size_tflag = 0;
/* a length modifier is only recognized when it is directly followed by
   'd', 'u' or 'i'; otherwise f is left on the modifier character, which
   then reaches the default case of the switch below */
Victor Stinner96865452011-03-01 23:44:09 +00002476 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002477 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002478 longflag = 1;
2479 ++f;
2480 }
2481#ifdef HAVE_LONG_LONG
2482 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002483 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002484 longlongflag = 1;
2485 f += 2;
2486 }
2487#endif
2488 }
2489 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002490 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002491 size_tflag = 1;
2492 ++f;
2493 }
Victor Stinnere215d962012-10-06 23:03:36 +02002494
/* the conversion character is the last character of the format string:
   clear the writer's overallocate flag before the final write */
2495 if (f[1] == '\0')
2496 writer->overallocate = 0;
2497
2498 switch (*f) {
2499 case 'c':
2500 {
2501 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002502 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002503 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002504 "character argument not in range(0x110000)");
2505 return NULL;
2506 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002507 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002508 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002509 break;
2510 }
2511
2512 case 'i':
2513 case 'd':
2514 case 'u':
2515 case 'x':
2516 {
2517 /* used by sprintf */
2518 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002519 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002520 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002521
/* the number is first rendered by sprintf() without width or precision
   (makefmt() only emits '%', a length modifier and the conversion);
   padding is applied afterwards through the writer */
2522 if (*f == 'u') {
2523 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2524
2525 if (longflag)
2526 len = sprintf(buffer, fmt,
2527 va_arg(*vargs, unsigned long));
2528#ifdef HAVE_LONG_LONG
2529 else if (longlongflag)
2530 len = sprintf(buffer, fmt,
2531 va_arg(*vargs, unsigned PY_LONG_LONG));
2532#endif
2533 else if (size_tflag)
2534 len = sprintf(buffer, fmt,
2535 va_arg(*vargs, size_t));
2536 else
2537 len = sprintf(buffer, fmt,
2538 va_arg(*vargs, unsigned int));
2539 }
2540 else if (*f == 'x') {
/* no length modifier for 'x': the format is always "%x" and the argument
   is fetched as an int */
2541 makefmt(fmt, 0, 0, 0, 'x');
2542 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2543 }
2544 else {
2545 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2546
2547 if (longflag)
2548 len = sprintf(buffer, fmt,
2549 va_arg(*vargs, long));
2550#ifdef HAVE_LONG_LONG
2551 else if (longlongflag)
2552 len = sprintf(buffer, fmt,
2553 va_arg(*vargs, PY_LONG_LONG));
2554#endif
2555 else if (size_tflag)
2556 len = sprintf(buffer, fmt,
2557 va_arg(*vargs, Py_ssize_t));
2558 else
2559 len = sprintf(buffer, fmt,
2560 va_arg(*vargs, int));
2561 }
2562 assert(len >= 0);
2563
/* from here on precision is the number of characters emitted for the
   number itself: the sprintf() output (sign included), left-padded with
   '0' up to the requested precision */
Victor Stinnere215d962012-10-06 23:03:36 +02002564 if (precision < len)
2565 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002566
2567 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002568 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2569 return NULL;
2570
/* pad up to the field width first, with '0' or ' ' depending on the '0'
   flag */
Victor Stinnere215d962012-10-06 23:03:36 +02002571 if (width > precision) {
2572 Py_UCS4 fillchar;
2573 fill = width - precision;
2574 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002575 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2576 return NULL;
2577 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002578 }
Victor Stinner15a11362012-10-06 23:48:20 +02002579 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002580 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002581 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2582 return NULL;
2583 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002584 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002585
Victor Stinner4a587072013-11-19 12:54:53 +01002586 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2587 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002588 break;
2589 }
2590
/* width and precision are not used for %p */
2591 case 'p':
2592 {
2593 char number[MAX_LONG_LONG_CHARS];
2594
2595 len = sprintf(number, "%p", va_arg(*vargs, void*));
2596 assert(len >= 0);
2597
2598 /* %p is ill-defined: ensure leading 0x. */
2599 if (number[1] == 'X')
2600 number[1] = 'x';
2601 else if (number[1] != 'x') {
2602 memmove(number + 2, number,
2603 strlen(number) + 1);
2604 number[0] = '0';
2605 number[1] = 'x';
2606 len += 2;
2607 }
2608
Victor Stinner4a587072013-11-19 12:54:53 +01002609 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002610 return NULL;
2611 break;
2612 }
2613
2614 case 's':
2615 {
2616 /* UTF-8 */
2617 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002618 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002619 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002620 break;
2621 }
2622
2623 case 'U':
2624 {
2625 PyObject *obj = va_arg(*vargs, PyObject *);
2626 assert(obj && _PyUnicode_CHECK(obj));
2627
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002628 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002629 return NULL;
2630 break;
2631 }
2632
/* %V always consumes two arguments; the C string is only used when the
   object argument is NULL */
2633 case 'V':
2634 {
2635 PyObject *obj = va_arg(*vargs, PyObject *);
2636 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002637 if (obj) {
2638 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002639 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002640 return NULL;
2641 }
2642 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002643 assert(str != NULL);
2644 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002645 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002646 }
2647 break;
2648 }
2649
/* %S, %R and %A write the result of PyObject_Str(), PyObject_Repr() and
   PyObject_ASCII() respectively; the temporary string is released on both
   the success and the error path */
2650 case 'S':
2651 {
2652 PyObject *obj = va_arg(*vargs, PyObject *);
2653 PyObject *str;
2654 assert(obj);
2655 str = PyObject_Str(obj);
2656 if (!str)
2657 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002658 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002659 Py_DECREF(str);
2660 return NULL;
2661 }
2662 Py_DECREF(str);
2663 break;
2664 }
2665
2666 case 'R':
2667 {
2668 PyObject *obj = va_arg(*vargs, PyObject *);
2669 PyObject *repr;
2670 assert(obj);
2671 repr = PyObject_Repr(obj);
2672 if (!repr)
2673 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002674 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002675 Py_DECREF(repr);
2676 return NULL;
2677 }
2678 Py_DECREF(repr);
2679 break;
2680 }
2681
2682 case 'A':
2683 {
2684 PyObject *obj = va_arg(*vargs, PyObject *);
2685 PyObject *ascii;
2686 assert(obj);
2687 ascii = PyObject_ASCII(obj);
2688 if (!ascii)
2689 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002690 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002691 Py_DECREF(ascii);
2692 return NULL;
2693 }
2694 Py_DECREF(ascii);
2695 break;
2696 }
2697
2698 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002699 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002700 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002701 break;
2702
2703 default:
2704 /* if we stumble upon an unknown formatting code, copy the rest
2705 of the format string to the output string. (we cannot just
2706 skip the code, since there's no way to know what's in the
2707 argument list) */
2708 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002709 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002710 return NULL;
2711 f = p+len;
2712 return f;
2713 }
2714
/* step over the conversion character */
2715 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002716 return f;
2717}
2718
Walter Dörwaldd2034312007-05-18 16:29:38 +00002719PyObject *
2720PyUnicode_FromFormatV(const char *format, va_list vargs)
2721{
Victor Stinnere215d962012-10-06 23:03:36 +02002722 va_list vargs2;
2723 const char *f;
2724 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002725
Victor Stinner8f674cc2013-04-17 23:02:17 +02002726 _PyUnicodeWriter_Init(&writer);
2727 writer.min_length = strlen(format) + 100;
2728 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002729
2730 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2731 Copy it to be able to pass a reference to a subfunction. */
2732 Py_VA_COPY(vargs2, vargs);
2733
2734 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002735 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002736 f = unicode_fromformat_arg(&writer, f, &vargs2);
2737 if (f == NULL)
2738 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002740 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002741 const char *p;
2742 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002743
Victor Stinnere215d962012-10-06 23:03:36 +02002744 p = f;
2745 do
2746 {
2747 if ((unsigned char)*p > 127) {
2748 PyErr_Format(PyExc_ValueError,
2749 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2750 "string, got a non-ASCII byte: 0x%02x",
2751 (unsigned char)*p);
2752 return NULL;
2753 }
2754 p++;
2755 }
2756 while (*p != '\0' && *p != '%');
2757 len = p - f;
2758
2759 if (*p == '\0')
2760 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002761
2762 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002763 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002764
2765 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002766 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002767 }
Victor Stinnere215d962012-10-06 23:03:36 +02002768 return _PyUnicodeWriter_Finish(&writer);
2769
2770 fail:
2771 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002772 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002773}
2774
Walter Dörwaldd2034312007-05-18 16:29:38 +00002775PyObject *
2776PyUnicode_FromFormat(const char *format, ...)
2777{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002778 PyObject* ret;
2779 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002780
2781#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002782 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002783#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002784 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002785#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002786 ret = PyUnicode_FromFormatV(format, vargs);
2787 va_end(vargs);
2788 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002789}
2790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002791#ifdef HAVE_WCHAR_H
2792
Victor Stinner5593d8a2010-10-02 11:11:27 +00002793/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2794 convert a Unicode object to a wide character string.
2795
Victor Stinnerd88d9832011-09-06 02:00:05 +02002796 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002797 character) required to convert the unicode object. Ignore size argument.
2798
Victor Stinnerd88d9832011-09-06 02:00:05 +02002799 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002800 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002801 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002802static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002803unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002804 wchar_t *w,
2805 Py_ssize_t size)
2806{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002807 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002808 const wchar_t *wstr;
2809
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002810 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002811 if (wstr == NULL)
2812 return -1;
2813
Victor Stinner5593d8a2010-10-02 11:11:27 +00002814 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002815 if (size > res)
2816 size = res + 1;
2817 else
2818 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002819 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002820 return res;
2821 }
2822 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002823 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002824}
2825
2826Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002827PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002828 wchar_t *w,
2829 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830{
2831 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002832 PyErr_BadInternalCall();
2833 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002835 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836}
2837
Victor Stinner137c34c2010-09-29 10:25:54 +00002838wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002839PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002840 Py_ssize_t *size)
2841{
2842 wchar_t* buffer;
2843 Py_ssize_t buflen;
2844
2845 if (unicode == NULL) {
2846 PyErr_BadInternalCall();
2847 return NULL;
2848 }
2849
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002850 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002851 if (buflen == -1)
2852 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002853 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002854 PyErr_NoMemory();
2855 return NULL;
2856 }
2857
Victor Stinner137c34c2010-09-29 10:25:54 +00002858 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2859 if (buffer == NULL) {
2860 PyErr_NoMemory();
2861 return NULL;
2862 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002863 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002864 if (buflen == -1) {
2865 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002866 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002867 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002868 if (size != NULL)
2869 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002870 return buffer;
2871}
2872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002873#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002874
Alexander Belopolsky40018472011-02-26 01:02:56 +00002875PyObject *
2876PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002877{
Victor Stinner8faf8212011-12-08 22:14:11 +01002878 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002879 PyErr_SetString(PyExc_ValueError,
2880 "chr() arg not in range(0x110000)");
2881 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002882 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002883
Victor Stinner985a82a2014-01-03 12:53:47 +01002884 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002885}
2886
Alexander Belopolsky40018472011-02-26 01:02:56 +00002887PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002888PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002890 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002891 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002892 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002893 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002894 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002895 Py_INCREF(obj);
2896 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002897 }
2898 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002899 /* For a Unicode subtype that's not a Unicode object,
2900 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002901 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002902 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002903 PyErr_Format(PyExc_TypeError,
2904 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002905 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002906 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002907}
2908
Alexander Belopolsky40018472011-02-26 01:02:56 +00002909PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002910PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002911 const char *encoding,
2912 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002913{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002914 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002915 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002916
Guido van Rossumd57fd912000-03-10 22:53:23 +00002917 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002918 PyErr_BadInternalCall();
2919 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002920 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002921
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002922 /* Decoding bytes objects is the most common case and should be fast */
2923 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002924 if (PyBytes_GET_SIZE(obj) == 0)
2925 _Py_RETURN_UNICODE_EMPTY();
2926 v = PyUnicode_Decode(
2927 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2928 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002929 return v;
2930 }
2931
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002932 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002933 PyErr_SetString(PyExc_TypeError,
2934 "decoding str is not supported");
2935 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002936 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002937
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002938 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2939 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2940 PyErr_Format(PyExc_TypeError,
2941 "coercing to str: need bytes, bytearray "
2942 "or buffer-like object, %.80s found",
2943 Py_TYPE(obj)->tp_name);
2944 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002945 }
Tim Petersced69f82003-09-16 20:30:58 +00002946
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002947 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002948 PyBuffer_Release(&buffer);
2949 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002950 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002951
Serhiy Storchaka05997252013-01-26 12:14:02 +02002952 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002953 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002954 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955}
2956
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   lower is the output buffer and lower_len its capacity in bytes, including
   the terminating null byte.

   Return 0 on error (the normalized name, plus its terminator, does not fit
   in lower_len bytes), 1 on success. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* A zero-sized buffer cannot even hold the terminator.  Without this
       check lower_len - 1 wraps around and the loop below would write
       without any bound. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2996
Alexander Belopolsky40018472011-02-26 01:02:56 +00002997PyObject *
2998PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002999 Py_ssize_t size,
3000 const char *encoding,
3001 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003002{
3003 PyObject *buffer = NULL, *unicode;
3004 Py_buffer info;
3005 char lower[11]; /* Enough for any encoding shortcut */
3006
Fred Drakee4315f52000-05-09 19:53:39 +00003007 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003008 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003009 if ((strcmp(lower, "utf-8") == 0) ||
3010 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003011 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003012 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003013 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003014 (strcmp(lower, "iso-8859-1") == 0) ||
3015 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00003016 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003017#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003018 else if (strcmp(lower, "mbcs") == 0)
3019 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003020#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003021 else if (strcmp(lower, "ascii") == 0)
3022 return PyUnicode_DecodeASCII(s, size, errors);
3023 else if (strcmp(lower, "utf-16") == 0)
3024 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3025 else if (strcmp(lower, "utf-32") == 0)
3026 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3027 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028
3029 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003030 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003031 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003032 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003033 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 if (buffer == NULL)
3035 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003036 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 if (unicode == NULL)
3038 goto onError;
3039 if (!PyUnicode_Check(unicode)) {
3040 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003041 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3042 "use codecs.decode() to decode to arbitrary types",
3043 encoding,
3044 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045 Py_DECREF(unicode);
3046 goto onError;
3047 }
3048 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003049 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003050
Benjamin Peterson29060642009-01-31 22:14:21 +00003051 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 Py_XDECREF(buffer);
3053 return NULL;
3054}
3055
Alexander Belopolsky40018472011-02-26 01:02:56 +00003056PyObject *
3057PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003058 const char *encoding,
3059 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003060{
3061 PyObject *v;
3062
3063 if (!PyUnicode_Check(unicode)) {
3064 PyErr_BadArgument();
3065 goto onError;
3066 }
3067
3068 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003069 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003070
3071 /* Decode via the codec registry */
3072 v = PyCodec_Decode(unicode, encoding, errors);
3073 if (v == NULL)
3074 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003075 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003076
Benjamin Peterson29060642009-01-31 22:14:21 +00003077 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003078 return NULL;
3079}
3080
Alexander Belopolsky40018472011-02-26 01:02:56 +00003081PyObject *
3082PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003083 const char *encoding,
3084 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003085{
3086 PyObject *v;
3087
3088 if (!PyUnicode_Check(unicode)) {
3089 PyErr_BadArgument();
3090 goto onError;
3091 }
3092
3093 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003094 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003095
3096 /* Decode via the codec registry */
3097 v = PyCodec_Decode(unicode, encoding, errors);
3098 if (v == NULL)
3099 goto onError;
3100 if (!PyUnicode_Check(v)) {
3101 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003102 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3103 "use codecs.decode() to decode to arbitrary types",
3104 encoding,
3105 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003106 Py_DECREF(v);
3107 goto onError;
3108 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003109 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003110
Benjamin Peterson29060642009-01-31 22:14:21 +00003111 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003112 return NULL;
3113}
3114
Alexander Belopolsky40018472011-02-26 01:02:56 +00003115PyObject *
3116PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003117 Py_ssize_t size,
3118 const char *encoding,
3119 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120{
3121 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003122
Guido van Rossumd57fd912000-03-10 22:53:23 +00003123 unicode = PyUnicode_FromUnicode(s, size);
3124 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003125 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3127 Py_DECREF(unicode);
3128 return v;
3129}
3130
Alexander Belopolsky40018472011-02-26 01:02:56 +00003131PyObject *
3132PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003133 const char *encoding,
3134 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003135{
3136 PyObject *v;
3137
3138 if (!PyUnicode_Check(unicode)) {
3139 PyErr_BadArgument();
3140 goto onError;
3141 }
3142
3143 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003144 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003145
3146 /* Encode via the codec registry */
3147 v = PyCodec_Encode(unicode, encoding, errors);
3148 if (v == NULL)
3149 goto onError;
3150 return v;
3151
Benjamin Peterson29060642009-01-31 22:14:21 +00003152 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003153 return NULL;
3154}
3155
/* Locate the first wide character of wstr that wcstombs() cannot convert.

   The string is fed to wcstombs() one character at a time (a surrogate pair
   counts as one character when wchar_t is 16 bits wide).  Returns the index,
   in wchar_t units, of the first character that fails, or 0 if every
   character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    const wchar_t *cursor = wstr;
    char scratch[MB_LEN_MAX];
    /* Holds one character followed by a terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];

    unit[2] = 0;
#else
    wchar_t unit[2];

    unit[1] = 0;
#endif

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;
        size_t converted;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep both halves of the pair together. */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = *cursor;
        cursor += 1;
#endif
        converted = wcstombs(scratch, unit, sizeof(scratch));
        if (converted == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3202
Victor Stinner1b579672011-12-17 05:47:23 +01003203static int
3204locale_error_handler(const char *errors, int *surrogateescape)
3205{
3206 if (errors == NULL) {
3207 *surrogateescape = 0;
3208 return 0;
3209 }
3210
3211 if (strcmp(errors, "strict") == 0) {
3212 *surrogateescape = 0;
3213 return 0;
3214 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003215 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003216 *surrogateescape = 1;
3217 return 0;
3218 }
3219 PyErr_Format(PyExc_ValueError,
3220 "only 'strict' and 'surrogateescape' error handlers "
3221 "are supported, not '%s'",
3222 errors);
3223 return -1;
3224}
3225
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003226PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003227PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003228{
3229 Py_ssize_t wlen, wlen2;
3230 wchar_t *wstr;
3231 PyObject *bytes = NULL;
3232 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003233 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003234 PyObject *exc;
3235 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003236 int surrogateescape;
3237
3238 if (locale_error_handler(errors, &surrogateescape) < 0)
3239 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003240
3241 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3242 if (wstr == NULL)
3243 return NULL;
3244
3245 wlen2 = wcslen(wstr);
3246 if (wlen2 != wlen) {
3247 PyMem_Free(wstr);
3248 PyErr_SetString(PyExc_TypeError, "embedded null character");
3249 return NULL;
3250 }
3251
3252 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003253 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003254 char *str;
3255
3256 str = _Py_wchar2char(wstr, &error_pos);
3257 if (str == NULL) {
3258 if (error_pos == (size_t)-1) {
3259 PyErr_NoMemory();
3260 PyMem_Free(wstr);
3261 return NULL;
3262 }
3263 else {
3264 goto encode_error;
3265 }
3266 }
3267 PyMem_Free(wstr);
3268
3269 bytes = PyBytes_FromString(str);
3270 PyMem_Free(str);
3271 }
3272 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003273 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003274 size_t len, len2;
3275
3276 len = wcstombs(NULL, wstr, 0);
3277 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003278 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003279 goto encode_error;
3280 }
3281
3282 bytes = PyBytes_FromStringAndSize(NULL, len);
3283 if (bytes == NULL) {
3284 PyMem_Free(wstr);
3285 return NULL;
3286 }
3287
3288 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3289 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003290 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003291 goto encode_error;
3292 }
3293 PyMem_Free(wstr);
3294 }
3295 return bytes;
3296
3297encode_error:
3298 errmsg = strerror(errno);
3299 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003300
3301 if (error_pos == (size_t)-1)
3302 error_pos = wcstombs_errorpos(wstr);
3303
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003304 PyMem_Free(wstr);
3305 Py_XDECREF(bytes);
3306
Victor Stinner2f197072011-12-17 07:08:30 +01003307 if (errmsg != NULL) {
3308 size_t errlen;
3309 wstr = _Py_char2wchar(errmsg, &errlen);
3310 if (wstr != NULL) {
3311 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003312 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003313 } else
3314 errmsg = NULL;
3315 }
3316 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003317 reason = PyUnicode_FromString(
3318 "wcstombs() encountered an unencodable "
3319 "wide character");
3320 if (reason == NULL)
3321 return NULL;
3322
3323 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3324 "locale", unicode,
3325 (Py_ssize_t)error_pos,
3326 (Py_ssize_t)(error_pos+1),
3327 reason);
3328 Py_DECREF(reason);
3329 if (exc != NULL) {
3330 PyCodec_StrictErrors(exc);
3331 Py_XDECREF(exc);
3332 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003333 return NULL;
3334}
3335
Victor Stinnerad158722010-10-27 00:25:46 +00003336PyObject *
3337PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003338{
Victor Stinner99b95382011-07-04 14:23:54 +02003339#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003340 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003341#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003342 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003343#else
Victor Stinner793b5312011-04-27 00:24:21 +02003344 PyInterpreterState *interp = PyThreadState_GET()->interp;
3345 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3346 cannot use it to encode and decode filenames before it is loaded. Load
3347 the Python codec requires to encode at least its own filename. Use the C
3348 version of the locale codec until the codec registry is initialized and
3349 the Python codec is loaded.
3350
3351 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3352 cannot only rely on it: check also interp->fscodec_initialized for
3353 subinterpreters. */
3354 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003355 return PyUnicode_AsEncodedString(unicode,
3356 Py_FileSystemDefaultEncoding,
3357 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003358 }
3359 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003360 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003361 }
Victor Stinnerad158722010-10-27 00:25:46 +00003362#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003363}
3364
Alexander Belopolsky40018472011-02-26 01:02:56 +00003365PyObject *
3366PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003367 const char *encoding,
3368 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369{
3370 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003371 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003372
Guido van Rossumd57fd912000-03-10 22:53:23 +00003373 if (!PyUnicode_Check(unicode)) {
3374 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003375 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003376 }
Fred Drakee4315f52000-05-09 19:53:39 +00003377
Fred Drakee4315f52000-05-09 19:53:39 +00003378 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003379 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003380 if ((strcmp(lower, "utf-8") == 0) ||
3381 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003382 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003383 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003384 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003385 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003386 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003387 }
Victor Stinner37296e82010-06-10 13:36:23 +00003388 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003389 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003390 (strcmp(lower, "iso-8859-1") == 0) ||
3391 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003392 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003393#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003394 else if (strcmp(lower, "mbcs") == 0)
3395 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003396#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003397 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003398 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003399 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003400
3401 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003402 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003404 return NULL;
3405
3406 /* The normal path */
3407 if (PyBytes_Check(v))
3408 return v;
3409
3410 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003411 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003412 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003413 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003414
3415 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003416 "encoder %s returned bytearray instead of bytes; "
3417 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003418 encoding);
3419 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003420 Py_DECREF(v);
3421 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003422 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003423
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003424 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3425 Py_DECREF(v);
3426 return b;
3427 }
3428
3429 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003430 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3431 "use codecs.encode() to encode to arbitrary types",
3432 encoding,
3433 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003434 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003435 return NULL;
3436}
3437
Alexander Belopolsky40018472011-02-26 01:02:56 +00003438PyObject *
3439PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003440 const char *encoding,
3441 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003442{
3443 PyObject *v;
3444
3445 if (!PyUnicode_Check(unicode)) {
3446 PyErr_BadArgument();
3447 goto onError;
3448 }
3449
3450 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003451 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003452
3453 /* Encode via the codec registry */
3454 v = PyCodec_Encode(unicode, encoding, errors);
3455 if (v == NULL)
3456 goto onError;
3457 if (!PyUnicode_Check(v)) {
3458 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003459 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3460 "use codecs.encode() to encode to arbitrary types",
3461 encoding,
3462 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003463 Py_DECREF(v);
3464 goto onError;
3465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003467
Benjamin Peterson29060642009-01-31 22:14:21 +00003468 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469 return NULL;
3470}
3471
/* Locate the first byte sequence in str (len bytes) that cannot be decoded.

   When mbrtowc() is available the input is decoded one character at a time;
   the byte offset of the first invalid or incomplete sequence is returned.
   Returns 0 when no such sequence is found, and always 0 when mbrtowc() is
   not available. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        str += converted;
        len -= converted;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    /* No mbrtowc(): the position cannot be determined. */
    (void)str;
    (void)len;
    return 0;
#endif
}
3502
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003503PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003504PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003505 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003506{
3507 wchar_t smallbuf[256];
3508 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3509 wchar_t *wstr;
3510 size_t wlen, wlen2;
3511 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003512 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003513 size_t error_pos;
3514 char *errmsg;
3515 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003516
3517 if (locale_error_handler(errors, &surrogateescape) < 0)
3518 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003519
3520 if (str[len] != '\0' || len != strlen(str)) {
3521 PyErr_SetString(PyExc_TypeError, "embedded null character");
3522 return NULL;
3523 }
3524
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003525 if (surrogateescape) {
3526 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003527 wstr = _Py_char2wchar(str, &wlen);
3528 if (wstr == NULL) {
3529 if (wlen == (size_t)-1)
3530 PyErr_NoMemory();
3531 else
3532 PyErr_SetFromErrno(PyExc_OSError);
3533 return NULL;
3534 }
3535
3536 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003537 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003538 }
3539 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003540 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003541#ifndef HAVE_BROKEN_MBSTOWCS
3542 wlen = mbstowcs(NULL, str, 0);
3543#else
3544 wlen = len;
3545#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003546 if (wlen == (size_t)-1)
3547 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003548 if (wlen+1 <= smallbuf_len) {
3549 wstr = smallbuf;
3550 }
3551 else {
3552 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3553 return PyErr_NoMemory();
3554
3555 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3556 if (!wstr)
3557 return PyErr_NoMemory();
3558 }
3559
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003560 wlen2 = mbstowcs(wstr, str, wlen+1);
3561 if (wlen2 == (size_t)-1) {
3562 if (wstr != smallbuf)
3563 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003564 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003565 }
3566#ifdef HAVE_BROKEN_MBSTOWCS
3567 assert(wlen2 == wlen);
3568#endif
3569 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3570 if (wstr != smallbuf)
3571 PyMem_Free(wstr);
3572 }
3573 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003574
3575decode_error:
3576 errmsg = strerror(errno);
3577 assert(errmsg != NULL);
3578
3579 error_pos = mbstowcs_errorpos(str, len);
3580 if (errmsg != NULL) {
3581 size_t errlen;
3582 wstr = _Py_char2wchar(errmsg, &errlen);
3583 if (wstr != NULL) {
3584 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003585 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003586 } else
3587 errmsg = NULL;
3588 }
3589 if (errmsg == NULL)
3590 reason = PyUnicode_FromString(
3591 "mbstowcs() encountered an invalid multibyte sequence");
3592 if (reason == NULL)
3593 return NULL;
3594
3595 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3596 "locale", str, len,
3597 (Py_ssize_t)error_pos,
3598 (Py_ssize_t)(error_pos+1),
3599 reason);
3600 Py_DECREF(reason);
3601 if (exc != NULL) {
3602 PyCodec_StrictErrors(exc);
3603 Py_XDECREF(exc);
3604 }
3605 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003606}
3607
3608PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003609PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003610{
3611 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003612 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003613}
3614
3615
3616PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003617PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003618 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003619 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3620}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003621
Christian Heimes5894ba72007-11-04 11:43:14 +00003622PyObject*
3623PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3624{
Victor Stinner99b95382011-07-04 14:23:54 +02003625#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003626 return PyUnicode_DecodeMBCS(s, size, NULL);
3627#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003628 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003629#else
Victor Stinner793b5312011-04-27 00:24:21 +02003630 PyInterpreterState *interp = PyThreadState_GET()->interp;
3631 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3632 cannot use it to encode and decode filenames before it is loaded. Load
3633 the Python codec requires to encode at least its own filename. Use the C
3634 version of the locale codec until the codec registry is initialized and
3635 the Python codec is loaded.
3636
3637 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3638 cannot only rely on it: check also interp->fscodec_initialized for
3639 subinterpreters. */
3640 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003641 return PyUnicode_Decode(s, size,
3642 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003643 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003644 }
3645 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003646 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003647 }
Victor Stinnerad158722010-10-27 00:25:46 +00003648#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003649}
3650
Martin v. Löwis011e8422009-05-05 04:43:17 +00003651
3652int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003653_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003654{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003655 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003656
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003657 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003658 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003659 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3660 PyUnicode_GET_LENGTH(str), '\0', 1);
3661 if (pos == -1)
3662 return 0;
3663 else
3664 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003665}
3666
Antoine Pitrou13348842012-01-29 18:36:34 +01003667int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003668PyUnicode_FSConverter(PyObject* arg, void* addr)
3669{
3670 PyObject *output = NULL;
3671 Py_ssize_t size;
3672 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003673 if (arg == NULL) {
3674 Py_DECREF(*(PyObject**)addr);
3675 return 1;
3676 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003677 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003678 output = arg;
3679 Py_INCREF(output);
3680 }
3681 else {
3682 arg = PyUnicode_FromObject(arg);
3683 if (!arg)
3684 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003685 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003686 Py_DECREF(arg);
3687 if (!output)
3688 return 0;
3689 if (!PyBytes_Check(output)) {
3690 Py_DECREF(output);
3691 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3692 return 0;
3693 }
3694 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003695 size = PyBytes_GET_SIZE(output);
3696 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003697 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003698 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003699 Py_DECREF(output);
3700 return 0;
3701 }
3702 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003703 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003704}
3705
3706
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003707int
3708PyUnicode_FSDecoder(PyObject* arg, void* addr)
3709{
3710 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003711 if (arg == NULL) {
3712 Py_DECREF(*(PyObject**)addr);
3713 return 1;
3714 }
3715 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003716 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003717 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003718 output = arg;
3719 Py_INCREF(output);
3720 }
3721 else {
3722 arg = PyBytes_FromObject(arg);
3723 if (!arg)
3724 return 0;
3725 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3726 PyBytes_GET_SIZE(arg));
3727 Py_DECREF(arg);
3728 if (!output)
3729 return 0;
3730 if (!PyUnicode_Check(output)) {
3731 Py_DECREF(output);
3732 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3733 return 0;
3734 }
3735 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003736 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003737 Py_DECREF(output);
3738 return 0;
3739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003740 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003741 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003742 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3743 Py_DECREF(output);
3744 return 0;
3745 }
3746 *(PyObject**)addr = output;
3747 return Py_CLEANUP_SUPPORTED;
3748}
3749
3750
Martin v. Löwis5b222132007-06-10 09:51:05 +00003751char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003752PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003753{
Christian Heimesf3863112007-11-22 07:46:41 +00003754 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003755
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003756 if (!PyUnicode_Check(unicode)) {
3757 PyErr_BadArgument();
3758 return NULL;
3759 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003760 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003761 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003762
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003763 if (PyUnicode_UTF8(unicode) == NULL) {
3764 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003765 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3766 if (bytes == NULL)
3767 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003768 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3769 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003770 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771 Py_DECREF(bytes);
3772 return NULL;
3773 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003774 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3775 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3776 PyBytes_AS_STRING(bytes),
3777 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003778 Py_DECREF(bytes);
3779 }
3780
3781 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003782 *psize = PyUnicode_UTF8_LENGTH(unicode);
3783 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003784}
3785
3786char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003787PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003788{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003789 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3790}
3791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003792Py_UNICODE *
3793PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3794{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003795 const unsigned char *one_byte;
3796#if SIZEOF_WCHAR_T == 4
3797 const Py_UCS2 *two_bytes;
3798#else
3799 const Py_UCS4 *four_bytes;
3800 const Py_UCS4 *ucs4_end;
3801 Py_ssize_t num_surrogates;
3802#endif
3803 wchar_t *w;
3804 wchar_t *wchar_end;
3805
3806 if (!PyUnicode_Check(unicode)) {
3807 PyErr_BadArgument();
3808 return NULL;
3809 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003810 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003811 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003812 assert(_PyUnicode_KIND(unicode) != 0);
3813 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003815 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003816#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003817 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3818 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003819 num_surrogates = 0;
3820
3821 for (; four_bytes < ucs4_end; ++four_bytes) {
3822 if (*four_bytes > 0xFFFF)
3823 ++num_surrogates;
3824 }
3825
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003826 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3827 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3828 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003829 PyErr_NoMemory();
3830 return NULL;
3831 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003832 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003833
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003834 w = _PyUnicode_WSTR(unicode);
3835 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3836 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003837 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3838 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003839 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003840 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003841 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3842 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003843 }
3844 else
3845 *w = *four_bytes;
3846
3847 if (w > wchar_end) {
3848 assert(0 && "Miscalculated string end");
3849 }
3850 }
3851 *w = 0;
3852#else
3853 /* sizeof(wchar_t) == 4 */
3854 Py_FatalError("Impossible unicode object state, wstr and str "
3855 "should share memory already.");
3856 return NULL;
3857#endif
3858 }
3859 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003860 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3861 (_PyUnicode_LENGTH(unicode) + 1));
3862 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003863 PyErr_NoMemory();
3864 return NULL;
3865 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003866 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3867 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3868 w = _PyUnicode_WSTR(unicode);
3869 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003870
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003871 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3872 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003873 for (; w < wchar_end; ++one_byte, ++w)
3874 *w = *one_byte;
3875 /* null-terminate the wstr */
3876 *w = 0;
3877 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003878 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003879#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003880 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881 for (; w < wchar_end; ++two_bytes, ++w)
3882 *w = *two_bytes;
3883 /* null-terminate the wstr */
3884 *w = 0;
3885#else
3886 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003887 PyObject_FREE(_PyUnicode_WSTR(unicode));
3888 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003889 Py_FatalError("Impossible unicode object state, wstr "
3890 "and str should share memory already.");
3891 return NULL;
3892#endif
3893 }
3894 else {
3895 assert(0 && "This should never happen.");
3896 }
3897 }
3898 }
3899 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003900 *size = PyUnicode_WSTR_LENGTH(unicode);
3901 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003902}
3903
Alexander Belopolsky40018472011-02-26 01:02:56 +00003904Py_UNICODE *
3905PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003907 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003908}
3909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003910
Alexander Belopolsky40018472011-02-26 01:02:56 +00003911Py_ssize_t
3912PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913{
3914 if (!PyUnicode_Check(unicode)) {
3915 PyErr_BadArgument();
3916 goto onError;
3917 }
3918 return PyUnicode_GET_SIZE(unicode);
3919
Benjamin Peterson29060642009-01-31 22:14:21 +00003920 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921 return -1;
3922}
3923
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003924Py_ssize_t
3925PyUnicode_GetLength(PyObject *unicode)
3926{
Victor Stinner07621332012-06-16 04:53:46 +02003927 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003928 PyErr_BadArgument();
3929 return -1;
3930 }
Victor Stinner07621332012-06-16 04:53:46 +02003931 if (PyUnicode_READY(unicode) == -1)
3932 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003933 return PyUnicode_GET_LENGTH(unicode);
3934}
3935
3936Py_UCS4
3937PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3938{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003939 void *data;
3940 int kind;
3941
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003942 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3943 PyErr_BadArgument();
3944 return (Py_UCS4)-1;
3945 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003946 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003947 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003948 return (Py_UCS4)-1;
3949 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003950 data = PyUnicode_DATA(unicode);
3951 kind = PyUnicode_KIND(unicode);
3952 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003953}
3954
3955int
3956PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3957{
3958 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003959 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003960 return -1;
3961 }
Victor Stinner488fa492011-12-12 00:01:39 +01003962 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003963 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003964 PyErr_SetString(PyExc_IndexError, "string index out of range");
3965 return -1;
3966 }
Victor Stinner488fa492011-12-12 00:01:39 +01003967 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003968 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003969 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3970 PyErr_SetString(PyExc_ValueError, "character out of range");
3971 return -1;
3972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003973 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3974 index, ch);
3975 return 0;
3976}
3977
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3983
Victor Stinner554f3f02010-06-16 23:33:54 +00003984/* create or adjust a UnicodeDecodeError */
3985static void
3986make_decode_exception(PyObject **exceptionObject,
3987 const char *encoding,
3988 const char *input, Py_ssize_t length,
3989 Py_ssize_t startpos, Py_ssize_t endpos,
3990 const char *reason)
3991{
3992 if (*exceptionObject == NULL) {
3993 *exceptionObject = PyUnicodeDecodeError_Create(
3994 encoding, input, length, startpos, endpos, reason);
3995 }
3996 else {
3997 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3998 goto onError;
3999 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4000 goto onError;
4001 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4002 goto onError;
4003 }
4004 return;
4005
4006onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004007 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004008}
4009
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   On success *input and *inend are refreshed from the exception's object
   attribute, *endinpos and *inptr are moved to the position returned by the
   handler, and the replacement is appended to the wchar_t buffer of *output
   at *outpos (the buffer is grown first when it is too small).
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The text after the "O!n;" format prefix doubles as the error message
       (see &argparse[4] below). */
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Do not go on to treat a non-bytes object as bytes: release our
           reference and report the error. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos + repwlen + insize-newpos;
    if (requiredsize > outsize) {
        if (requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif /* HAVE_MBCS */
4116
4117static int
4118unicode_decode_call_errorhandler_writer(
4119 const char *errors, PyObject **errorHandler,
4120 const char *encoding, const char *reason,
4121 const char **input, const char **inend, Py_ssize_t *startinpos,
4122 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4123 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4124{
4125 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4126
4127 PyObject *restuple = NULL;
4128 PyObject *repunicode = NULL;
4129 Py_ssize_t insize;
4130 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004131 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004132 PyObject *inputobj = NULL;
4133
4134 if (*errorHandler == NULL) {
4135 *errorHandler = PyCodec_LookupError(errors);
4136 if (*errorHandler == NULL)
4137 goto onError;
4138 }
4139
4140 make_decode_exception(exceptionObject,
4141 encoding,
4142 *input, *inend - *input,
4143 *startinpos, *endinpos,
4144 reason);
4145 if (*exceptionObject == NULL)
4146 goto onError;
4147
4148 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4149 if (restuple == NULL)
4150 goto onError;
4151 if (!PyTuple_Check(restuple)) {
4152 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4153 goto onError;
4154 }
4155 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004156 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004157
4158 /* Copy back the bytes variables, which might have been modified by the
4159 callback */
4160 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4161 if (!inputobj)
4162 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004163 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004164 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004165 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004166 *input = PyBytes_AS_STRING(inputobj);
4167 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004168 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004169 /* we can DECREF safely, as the exception has another reference,
4170 so the object won't go away. */
4171 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004172
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004173 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004175 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004176 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4177 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004178 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004179
Victor Stinner8f674cc2013-04-17 23:02:17 +02004180 if (PyUnicode_READY(repunicode) < 0)
4181 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004182 replen = PyUnicode_GET_LENGTH(repunicode);
4183 writer->min_length += replen;
4184 if (replen > 1)
Victor Stinner8f674cc2013-04-17 23:02:17 +02004185 writer->overallocate = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004186 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004187 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004188
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004190 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004191
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004193 Py_XDECREF(restuple);
4194 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004195
Benjamin Peterson29060642009-01-31 22:14:21 +00004196 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004198 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199}
4200
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004201/* --- UTF-7 Codec -------------------------------------------------------- */
4202
Antoine Pitrou244651a2009-05-04 18:56:13 +00004203/* See RFC2152 for details. We encode conservatively and decode liberally. */
4204
/* Three small helper macros for the base-64 alphabet
   (A-Z, a-z, 0-9, '+', '/'). */

/* True if c is one of the 64 base-64 characters. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))

/* Map a base-64 character to its 6-bit value.  The result is only
   meaningful when IS_BASE64(c) holds: any other input yields 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? ((c) - 'A') :         \
     ((c) >= 'a' && (c) <= 'z') ? ((c) - 'a' + 26) :    \
     ((c) >= '0' && (c) <= '9') ? ((c) - '0' + 52) :    \
     ((c) == '+') ? 62 : 63)

/* Map the low 6 bits of n to the corresponding base-64 character. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if this byte, found outside a base-64 section of a
 * UTF-7 string, decodes to itself.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which starts a base-64 section. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
4235
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * NUL and characters >= 128 are never encoded directly.  The directO and
 * directWS arguments are parenthesized in the expansion so that an
 * expression argument (e.g. `a || b`) is evaluated as a unit. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004281
Alexander Belopolsky40018472011-02-26 01:02:56 +00004282PyObject *
4283PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004284 Py_ssize_t size,
4285 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004286{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004287 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4288}
4289
Antoine Pitrou244651a2009-05-04 18:56:13 +00004290/* The decoder. The only state we preserve is our read position,
4291 * i.e. how many characters we have consumed. So if we end in the
4292 * middle of a shift sequence we have to back off the read position
4293 * and the output to the beginning of the sequence, otherwise we lose
4294 * all the shift state (seen bits, number of bits seen, high
4295 * surrogate). */
4296
Alexander Belopolsky40018472011-02-26 01:02:56 +00004297PyObject *
4298PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004299 Py_ssize_t size,
4300 const char *errors,
4301 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004302{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004303 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004304 Py_ssize_t startinpos;
4305 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004306 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004307 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004308 const char *errmsg = "";
4309 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004310 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004311 unsigned int base64bits = 0;
4312 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004313 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004314 PyObject *errorHandler = NULL;
4315 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004316
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004317 if (size == 0) {
4318 if (consumed)
4319 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004320 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004321 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004322
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004323 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004324 _PyUnicodeWriter_Init(&writer);
4325 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004326
4327 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004328 e = s + size;
4329
4330 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004331 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004332 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004333 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004334
Antoine Pitrou244651a2009-05-04 18:56:13 +00004335 if (inShift) { /* in a base-64 section */
4336 if (IS_BASE64(ch)) { /* consume a base-64 character */
4337 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4338 base64bits += 6;
4339 s++;
4340 if (base64bits >= 16) {
4341 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004342 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004343 base64bits -= 16;
4344 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004345 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004346 if (surrogate) {
4347 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004348 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4349 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004350 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004351 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004352 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004353 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004354 }
4355 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004356 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004357 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004358 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359 }
4360 }
Victor Stinner551ac952011-11-29 22:58:13 +01004361 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004362 /* first surrogate */
4363 surrogate = outCh;
4364 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004365 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004366 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004367 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004368 }
4369 }
4370 }
4371 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004372 inShift = 0;
4373 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004374 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004375 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004376 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004377 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004378 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004379 if (base64bits > 0) { /* left-over bits */
4380 if (base64bits >= 6) {
4381 /* We've seen at least one base-64 character */
4382 errmsg = "partial character in shift sequence";
4383 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004384 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004385 else {
4386 /* Some bits remain; they should be zero */
4387 if (base64buffer != 0) {
4388 errmsg = "non-zero padding bits in shift sequence";
4389 goto utf7Error;
4390 }
4391 }
4392 }
4393 if (ch != '-') {
4394 /* '-' is absorbed; other terminating
4395 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004396 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004397 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004398 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004399 }
4400 }
4401 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004403 s++; /* consume '+' */
4404 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004405 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004406 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004407 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004408 }
4409 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004410 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004411 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004412 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004413 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004414 }
4415 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004416 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004417 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004418 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004419 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004420 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004421 else {
4422 startinpos = s-starts;
4423 s++;
4424 errmsg = "unexpected special character";
4425 goto utf7Error;
4426 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004427 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004428utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004429 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004430 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 errors, &errorHandler,
4432 "utf7", errmsg,
4433 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004434 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004435 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004436 }
4437
Antoine Pitrou244651a2009-05-04 18:56:13 +00004438 /* end of string */
4439
4440 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4441 /* if we're in an inconsistent state, that's an error */
4442 if (surrogate ||
4443 (base64bits >= 6) ||
4444 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004445 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004446 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004447 errors, &errorHandler,
4448 "utf7", "unterminated shift sequence",
4449 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004450 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004451 goto onError;
4452 if (s < e)
4453 goto restart;
4454 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004455 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004456
4457 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004458 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004459 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004460 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004461 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004462 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004463 writer.kind, writer.data, shiftOutStart);
4464 Py_XDECREF(errorHandler);
4465 Py_XDECREF(exc);
4466 _PyUnicodeWriter_Dealloc(&writer);
4467 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004468 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004469 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004470 }
4471 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004472 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004473 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004474 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004475
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004476 Py_XDECREF(errorHandler);
4477 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004478 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004479
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004481 Py_XDECREF(errorHandler);
4482 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004483 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004484 return NULL;
4485}
4486
4487
Alexander Belopolsky40018472011-02-26 01:02:56 +00004488PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004489_PyUnicode_EncodeUTF7(PyObject *str,
4490 int base64SetO,
4491 int base64WhiteSpace,
4492 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004493{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004494 int kind;
4495 void *data;
4496 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004497 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004498 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004499 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004500 unsigned int base64bits = 0;
4501 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004502 char * out;
4503 char * start;
4504
Benjamin Petersonbac79492012-01-14 13:34:47 -05004505 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004506 return NULL;
4507 kind = PyUnicode_KIND(str);
4508 data = PyUnicode_DATA(str);
4509 len = PyUnicode_GET_LENGTH(str);
4510
4511 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004513
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004514 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004515 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004516 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004517 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004518 if (v == NULL)
4519 return NULL;
4520
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004521 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004522 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004523 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004524
Antoine Pitrou244651a2009-05-04 18:56:13 +00004525 if (inShift) {
4526 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4527 /* shifting out */
4528 if (base64bits) { /* output remaining bits */
4529 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4530 base64buffer = 0;
4531 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004532 }
4533 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004534 /* Characters not in the BASE64 set implicitly unshift the sequence
4535 so no '-' is required, except if the character is itself a '-' */
4536 if (IS_BASE64(ch) || ch == '-') {
4537 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004538 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004539 *out++ = (char) ch;
4540 }
4541 else {
4542 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004543 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004544 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004545 else { /* not in a shift sequence */
4546 if (ch == '+') {
4547 *out++ = '+';
4548 *out++ = '-';
4549 }
4550 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4551 *out++ = (char) ch;
4552 }
4553 else {
4554 *out++ = '+';
4555 inShift = 1;
4556 goto encode_char;
4557 }
4558 }
4559 continue;
4560encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004562 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004563
Antoine Pitrou244651a2009-05-04 18:56:13 +00004564 /* code first surrogate */
4565 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004566 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004567 while (base64bits >= 6) {
4568 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4569 base64bits -= 6;
4570 }
4571 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004572 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574 base64bits += 16;
4575 base64buffer = (base64buffer << 16) | ch;
4576 while (base64bits >= 6) {
4577 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4578 base64bits -= 6;
4579 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004580 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 if (base64bits)
4582 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4583 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004584 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004585 if (_PyBytes_Resize(&v, out - start) < 0)
4586 return NULL;
4587 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004588}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004589PyObject *
4590PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4591 Py_ssize_t size,
4592 int base64SetO,
4593 int base64WhiteSpace,
4594 const char *errors)
4595{
4596 PyObject *result;
4597 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4598 if (tmp == NULL)
4599 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004600 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004601 base64WhiteSpace, errors);
4602 Py_DECREF(tmp);
4603 return result;
4604}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004605
Antoine Pitrou244651a2009-05-04 18:56:13 +00004606#undef IS_BASE64
4607#undef FROM_BASE64
4608#undef TO_BASE64
4609#undef DECODE_DIRECT
4610#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004611
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612/* --- UTF-8 Codec -------------------------------------------------------- */
4613
Alexander Belopolsky40018472011-02-26 01:02:56 +00004614PyObject *
4615PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004616 Py_ssize_t size,
4617 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618{
Walter Dörwald69652032004-09-07 20:24:22 +00004619 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4620}
4621
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004622#include "stringlib/asciilib.h"
4623#include "stringlib/codecs.h"
4624#include "stringlib/undef.h"
4625
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004626#include "stringlib/ucs1lib.h"
4627#include "stringlib/codecs.h"
4628#include "stringlib/undef.h"
4629
4630#include "stringlib/ucs2lib.h"
4631#include "stringlib/codecs.h"
4632#include "stringlib/undef.h"
4633
4634#include "stringlib/ucs4lib.h"
4635#include "stringlib/codecs.h"
4636#include "stringlib/undef.h"
4637
Antoine Pitrouab868312009-01-10 15:40:25 +00004638/* Mask to quickly check whether a C 'long' contains a
4639 non-ASCII, UTF8-encoded char. */
4640#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004641# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004642#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004643# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004644#else
4645# error C 'long' size should be either 4 or 8!
4646#endif
4647
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004648static Py_ssize_t
4649ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004650{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004651 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004652 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004653
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004654 /*
4655 * Issue #17237: m68k is a bit different from most architectures in
4656 * that objects do not use "natural alignment" - for example, int and
4657 * long are only aligned at 2-byte boundaries. Therefore the assert()
4658 * won't work; also, tests have shown that skipping the "optimised
4659 * version" will even speed up m68k.
4660 */
4661#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004662#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004663 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4664 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004665 /* Fast path, see in STRINGLIB(utf8_decode) for
4666 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004667 /* Help allocation */
4668 const char *_p = p;
4669 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004670 while (_p < aligned_end) {
4671 unsigned long value = *(const unsigned long *) _p;
4672 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004673 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004674 *((unsigned long *)q) = value;
4675 _p += SIZEOF_LONG;
4676 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004677 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004678 p = _p;
4679 while (p < end) {
4680 if ((unsigned char)*p & 0x80)
4681 break;
4682 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004683 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004684 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004686#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004687#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004688 while (p < end) {
4689 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4690 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004691 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004692 /* Help allocation */
4693 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004694 while (_p < aligned_end) {
4695 unsigned long value = *(unsigned long *) _p;
4696 if (value & ASCII_CHAR_MASK)
4697 break;
4698 _p += SIZEOF_LONG;
4699 }
4700 p = _p;
4701 if (_p == end)
4702 break;
4703 }
4704 if ((unsigned char)*p & 0x80)
4705 break;
4706 ++p;
4707 }
4708 memcpy(dest, start, p - start);
4709 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710}
Antoine Pitrouab868312009-01-10 15:40:25 +00004711
Victor Stinner785938e2011-12-11 20:09:03 +01004712PyObject *
4713PyUnicode_DecodeUTF8Stateful(const char *s,
4714 Py_ssize_t size,
4715 const char *errors,
4716 Py_ssize_t *consumed)
4717{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004718 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004719 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004720 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004721
4722 Py_ssize_t startinpos;
4723 Py_ssize_t endinpos;
4724 const char *errmsg = "";
4725 PyObject *errorHandler = NULL;
4726 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004727
4728 if (size == 0) {
4729 if (consumed)
4730 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004731 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004732 }
4733
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004734 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4735 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004736 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004737 *consumed = 1;
4738 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004739 }
4740
Victor Stinner8f674cc2013-04-17 23:02:17 +02004741 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004742 writer.min_length = size;
4743 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004744 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004745
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004746 writer.pos = ascii_decode(s, end, writer.data);
4747 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004748 while (s < end) {
4749 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004750 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004751 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004752 if (PyUnicode_IS_ASCII(writer.buffer))
4753 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004754 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004755 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004756 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004757 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004758 } else {
4759 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004760 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004761 }
4762
4763 switch (ch) {
4764 case 0:
4765 if (s == end || consumed)
4766 goto End;
4767 errmsg = "unexpected end of data";
4768 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004769 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004770 break;
4771 case 1:
4772 errmsg = "invalid start byte";
4773 startinpos = s - starts;
4774 endinpos = startinpos + 1;
4775 break;
4776 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004777 case 3:
4778 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004779 errmsg = "invalid continuation byte";
4780 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004781 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004782 break;
4783 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004784 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004785 goto onError;
4786 continue;
4787 }
4788
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004789 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004790 errors, &errorHandler,
4791 "utf-8", errmsg,
4792 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004793 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004794 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004795 }
4796
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004797End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004798 if (consumed)
4799 *consumed = s - starts;
4800
4801 Py_XDECREF(errorHandler);
4802 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004803 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004804
4805onError:
4806 Py_XDECREF(errorHandler);
4807 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004808 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004809 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004810}
4811
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Every byte that cannot be decoded is stored as the code unit
   0xDC00 + byte.

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_RawFree() to free the memory), or NULL on memory
   allocation error or if `size` is negative or too large. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count, so (size + 1) elements are enough for the result
       plus the terminator.  The bound is checked without computing
       size + 1 first (which would overflow for PY_SSIZE_T_MAX), and a
       negative size is rejected so that the terminator below is never
       written into a zero-length allocation. */
    if (size < 0 || size >= PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t))
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
            /* the decoder handed back a code point it could not store */
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* Only a non-BMP code point does not fit in a 16-bit unit. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            if (!ch && s == e)
                break;
            /* surrogateescape: escape one undecodable byte and resume
               decoding right after it */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004867
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004868/* Primary internal function which creates utf8 encoded bytes objects.
4869
4870 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004871 and allocate exactly as much space needed at the end. Else allocate the
4872 maximum possible needed (4 result bytes per Unicode character), and return
4873 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004874*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004875PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004876_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877{
Victor Stinner6099a032011-12-18 14:22:26 +01004878 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004879 void *data;
4880 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004881
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004882 if (!PyUnicode_Check(unicode)) {
4883 PyErr_BadArgument();
4884 return NULL;
4885 }
4886
4887 if (PyUnicode_READY(unicode) == -1)
4888 return NULL;
4889
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004890 if (PyUnicode_UTF8(unicode))
4891 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4892 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004893
4894 kind = PyUnicode_KIND(unicode);
4895 data = PyUnicode_DATA(unicode);
4896 size = PyUnicode_GET_LENGTH(unicode);
4897
Benjamin Petersonead6b532011-12-20 17:23:42 -06004898 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004899 default:
4900 assert(0);
4901 case PyUnicode_1BYTE_KIND:
4902 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4903 assert(!PyUnicode_IS_ASCII(unicode));
4904 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4905 case PyUnicode_2BYTE_KIND:
4906 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4907 case PyUnicode_4BYTE_KIND:
4908 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910}
4911
Alexander Belopolsky40018472011-02-26 01:02:56 +00004912PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004913PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4914 Py_ssize_t size,
4915 const char *errors)
4916{
4917 PyObject *v, *unicode;
4918
4919 unicode = PyUnicode_FromUnicode(s, size);
4920 if (unicode == NULL)
4921 return NULL;
4922 v = _PyUnicode_AsUTF8String(unicode, errors);
4923 Py_DECREF(unicode);
4924 return v;
4925}
4926
4927PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004928PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004930 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931}
4932
Walter Dörwald41980ca2007-08-16 21:55:45 +00004933/* --- UTF-32 Codec ------------------------------------------------------- */
4934
4935PyObject *
4936PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 Py_ssize_t size,
4938 const char *errors,
4939 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004940{
4941 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4942}
4943
4944PyObject *
4945PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 Py_ssize_t size,
4947 const char *errors,
4948 int *byteorder,
4949 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004950{
4951 const char *starts = s;
4952 Py_ssize_t startinpos;
4953 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004954 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004955 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004956 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004957 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004958 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004959 PyObject *errorHandler = NULL;
4960 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004961
Walter Dörwald41980ca2007-08-16 21:55:45 +00004962 q = (unsigned char *)s;
4963 e = q + size;
4964
4965 if (byteorder)
4966 bo = *byteorder;
4967
4968 /* Check for BOM marks (U+FEFF) in the input and adjust current
4969 byte order setting accordingly. In native mode, the leading BOM
4970 mark is skipped, in all other modes, it is copied to the output
4971 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004972 if (bo == 0 && size >= 4) {
4973 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4974 if (bom == 0x0000FEFF) {
4975 bo = -1;
4976 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004977 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004978 else if (bom == 0xFFFE0000) {
4979 bo = 1;
4980 q += 4;
4981 }
4982 if (byteorder)
4983 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004984 }
4985
Victor Stinnere64322e2012-10-30 23:12:47 +01004986 if (q == e) {
4987 if (consumed)
4988 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004989 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004990 }
4991
Victor Stinnere64322e2012-10-30 23:12:47 +01004992#ifdef WORDS_BIGENDIAN
4993 le = bo < 0;
4994#else
4995 le = bo <= 0;
4996#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004997 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01004998
Victor Stinner8f674cc2013-04-17 23:02:17 +02004999 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005000 writer.min_length = (e - q + 3) / 4;
5001 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005002 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005003
Victor Stinnere64322e2012-10-30 23:12:47 +01005004 while (1) {
5005 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005006 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005007
Victor Stinnere64322e2012-10-30 23:12:47 +01005008 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005009 enum PyUnicode_Kind kind = writer.kind;
5010 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005011 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005012 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005013 if (le) {
5014 do {
5015 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5016 if (ch > maxch)
5017 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005018 if (kind != PyUnicode_1BYTE_KIND &&
5019 Py_UNICODE_IS_SURROGATE(ch))
5020 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005021 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005022 q += 4;
5023 } while (q <= last);
5024 }
5025 else {
5026 do {
5027 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5028 if (ch > maxch)
5029 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005030 if (kind != PyUnicode_1BYTE_KIND &&
5031 Py_UNICODE_IS_SURROGATE(ch))
5032 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005033 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005034 q += 4;
5035 } while (q <= last);
5036 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005037 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005038 }
5039
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005040 if (Py_UNICODE_IS_SURROGATE(ch)) {
5041 errmsg = "codepoint in surrogate code point range(0xd800, 0xe000)";
5042 startinpos = ((const char *)q) - starts;
5043 endinpos = startinpos + 4;
5044 }
5045 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005046 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005048 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005049 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005050 startinpos = ((const char *)q) - starts;
5051 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005053 else {
5054 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005055 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005056 goto onError;
5057 q += 4;
5058 continue;
5059 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005060 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005061 startinpos = ((const char *)q) - starts;
5062 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005064
5065 /* The remaining input chars are ignored if the callback
5066 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005067 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005068 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005069 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005070 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005071 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005073 }
5074
Walter Dörwald41980ca2007-08-16 21:55:45 +00005075 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005077
Walter Dörwald41980ca2007-08-16 21:55:45 +00005078 Py_XDECREF(errorHandler);
5079 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005080 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005081
Benjamin Peterson29060642009-01-31 22:14:21 +00005082 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005083 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005084 Py_XDECREF(errorHandler);
5085 Py_XDECREF(exc);
5086 return NULL;
5087}
5088
5089PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005090_PyUnicode_EncodeUTF32(PyObject *str,
5091 const char *errors,
5092 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005093{
Serhiy Storchaka30793282014-01-04 22:44:01 +02005094 int kind;
5095 void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005096 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005097 PyObject *v;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005098 unsigned char *p;
5099 Py_ssize_t nsize, i;
5100 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005101#if PY_LITTLE_ENDIAN
Serhiy Storchaka30793282014-01-04 22:44:01 +02005102 int iorder[] = {0, 1, 2, 3};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005103#else
Serhiy Storchaka30793282014-01-04 22:44:01 +02005104 int iorder[] = {3, 2, 1, 0};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005105#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005106 const char *encoding;
5107 PyObject *errorHandler = NULL;
5108 PyObject *exc = NULL;
5109 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005110
Serhiy Storchaka30793282014-01-04 22:44:01 +02005111#define STORECHAR(CH) \
5112 do { \
5113 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5114 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5115 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5116 p[iorder[0]] = (CH) & 0xff; \
5117 p += 4; \
5118 } while(0)
5119
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005120 if (!PyUnicode_Check(str)) {
5121 PyErr_BadArgument();
5122 return NULL;
5123 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005124 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005125 return NULL;
5126 kind = PyUnicode_KIND(str);
5127 data = PyUnicode_DATA(str);
5128 len = PyUnicode_GET_LENGTH(str);
5129
Serhiy Storchaka583a9392014-01-04 19:25:37 +02005130 nsize = len + (byteorder == 0);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005131 if (nsize > PY_SSIZE_T_MAX / 4)
5132 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005133 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005134 if (v == NULL)
5135 return NULL;
5136
Serhiy Storchaka30793282014-01-04 22:44:01 +02005137 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005138 if (byteorder == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005139 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005140 if (len == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005141 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005142
Serhiy Storchaka30793282014-01-04 22:44:01 +02005143 if (byteorder == -1) {
5144 /* force LE */
5145 iorder[0] = 0;
5146 iorder[1] = 1;
5147 iorder[2] = 2;
5148 iorder[3] = 3;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005149 encoding = "utf-32-le";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005150 }
5151 else if (byteorder == 1) {
5152 /* force BE */
5153 iorder[0] = 3;
5154 iorder[1] = 2;
5155 iorder[2] = 1;
5156 iorder[3] = 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005157 encoding = "utf-32-be";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005158 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005159 else
5160 encoding = "utf-32";
5161
5162 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005163 for (i = 0; i < len; i++)
5164 STORECHAR(PyUnicode_READ(kind, data, i));
5165 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005166 }
5167
Serhiy Storchaka30793282014-01-04 22:44:01 +02005168 for (i = 0; i < len;) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005169 Py_ssize_t repsize, moreunits;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005170 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5171 i++;
5172 assert(ch <= MAX_UNICODE);
5173 if (!Py_UNICODE_IS_SURROGATE(ch)) {
5174 STORECHAR(ch);
5175 continue;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005176 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005177
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005178 rep = unicode_encode_call_errorhandler(
5179 errors, &errorHandler,
5180 encoding, "surrogates not allowed",
Serhiy Storchaka30793282014-01-04 22:44:01 +02005181 str, &exc, i-1, i, &i);
5182
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005183 if (!rep)
5184 goto error;
5185
5186 if (PyBytes_Check(rep)) {
5187 repsize = PyBytes_GET_SIZE(rep);
5188 if (repsize & 3) {
5189 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005190 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005191 "surrogates not allowed");
5192 goto error;
5193 }
5194 moreunits = repsize / 4;
5195 }
5196 else {
5197 assert(PyUnicode_Check(rep));
5198 if (PyUnicode_READY(rep) < 0)
5199 goto error;
5200 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5201 if (!PyUnicode_IS_ASCII(rep)) {
5202 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005203 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005204 "surrogates not allowed");
5205 goto error;
5206 }
5207 }
5208
5209 /* four bytes are reserved for each surrogate */
5210 if (moreunits > 1) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005211 Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005212 Py_ssize_t morebytes = 4 * (moreunits - 1);
5213 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5214 /* integer overflow */
5215 PyErr_NoMemory();
5216 goto error;
5217 }
5218 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5219 goto error;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005220 p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005221 }
5222
5223 if (PyBytes_Check(rep)) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005224 Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
5225 p += repsize;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005226 } else /* rep is unicode */ {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005227 const Py_UCS1 *repdata;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005228 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005229 repdata = PyUnicode_1BYTE_DATA(rep);
5230 while (repsize--) {
5231 Py_UCS4 ch = *repdata++;
5232 STORECHAR(ch);
5233 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005234 }
5235
5236 Py_CLEAR(rep);
5237 }
5238
5239 /* Cut back to size actually needed. This is necessary for, for example,
5240 encoding of a string containing isolated surrogates and the 'ignore'
5241 handler is used. */
Serhiy Storchaka30793282014-01-04 22:44:01 +02005242 nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005243 if (nsize != PyBytes_GET_SIZE(v))
5244 _PyBytes_Resize(&v, nsize);
5245 Py_XDECREF(errorHandler);
5246 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005247 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005248 error:
5249 Py_XDECREF(rep);
5250 Py_XDECREF(errorHandler);
5251 Py_XDECREF(exc);
5252 Py_XDECREF(v);
5253 return NULL;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005254#undef STORECHAR
Walter Dörwald41980ca2007-08-16 21:55:45 +00005255}
5256
Alexander Belopolsky40018472011-02-26 01:02:56 +00005257PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005258PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5259 Py_ssize_t size,
5260 const char *errors,
5261 int byteorder)
5262{
5263 PyObject *result;
5264 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5265 if (tmp == NULL)
5266 return NULL;
5267 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5268 Py_DECREF(tmp);
5269 return result;
5270}
5271
5272PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005273PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005274{
Victor Stinnerb960b342011-11-20 19:12:52 +01005275 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005276}
5277
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278/* --- UTF-16 Codec ------------------------------------------------------- */
5279
Tim Peters772747b2001-08-09 22:21:55 +00005280PyObject *
5281PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005282 Py_ssize_t size,
5283 const char *errors,
5284 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285{
Walter Dörwald69652032004-09-07 20:24:22 +00005286 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5287}
5288
5289PyObject *
5290PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005291 Py_ssize_t size,
5292 const char *errors,
5293 int *byteorder,
5294 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005295{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005296 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005297 Py_ssize_t startinpos;
5298 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005299 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005300 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005301 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005302 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005303 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005304 PyObject *errorHandler = NULL;
5305 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005306 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307
Tim Peters772747b2001-08-09 22:21:55 +00005308 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005309 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310
5311 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005312 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005314 /* Check for BOM marks (U+FEFF) in the input and adjust current
5315 byte order setting accordingly. In native mode, the leading BOM
5316 mark is skipped, in all other modes, it is copied to the output
5317 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005318 if (bo == 0 && size >= 2) {
5319 const Py_UCS4 bom = (q[1] << 8) | q[0];
5320 if (bom == 0xFEFF) {
5321 q += 2;
5322 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005324 else if (bom == 0xFFFE) {
5325 q += 2;
5326 bo = 1;
5327 }
5328 if (byteorder)
5329 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005330 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331
Antoine Pitrou63065d72012-05-15 23:48:04 +02005332 if (q == e) {
5333 if (consumed)
5334 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005335 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005336 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005337
Christian Heimes743e0cd2012-10-17 23:52:17 +02005338#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005339 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005340 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005341#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005342 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005343 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005344#endif
Tim Peters772747b2001-08-09 22:21:55 +00005345
Antoine Pitrou63065d72012-05-15 23:48:04 +02005346 /* Note: size will always be longer than the resulting Unicode
5347 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005348 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005349 writer.min_length = (e - q + 1) / 2;
5350 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005351 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005352
Antoine Pitrou63065d72012-05-15 23:48:04 +02005353 while (1) {
5354 Py_UCS4 ch = 0;
5355 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005356 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005357 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005358 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005359 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005360 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005361 native_ordering);
5362 else
5363 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005364 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005365 native_ordering);
5366 } else if (kind == PyUnicode_2BYTE_KIND) {
5367 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005368 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005369 native_ordering);
5370 } else {
5371 assert(kind == PyUnicode_4BYTE_KIND);
5372 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005373 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005374 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005375 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005376 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005377
Antoine Pitrou63065d72012-05-15 23:48:04 +02005378 switch (ch)
5379 {
5380 case 0:
5381 /* remaining byte at the end? (size should be even) */
5382 if (q == e || consumed)
5383 goto End;
5384 errmsg = "truncated data";
5385 startinpos = ((const char *)q) - starts;
5386 endinpos = ((const char *)e) - starts;
5387 break;
5388 /* The remaining input chars are ignored if the callback
5389 chooses to skip the input */
5390 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005391 q -= 2;
5392 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005393 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005394 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005395 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005396 endinpos = ((const char *)e) - starts;
5397 break;
5398 case 2:
5399 errmsg = "illegal encoding";
5400 startinpos = ((const char *)q) - 2 - starts;
5401 endinpos = startinpos + 2;
5402 break;
5403 case 3:
5404 errmsg = "illegal UTF-16 surrogate";
5405 startinpos = ((const char *)q) - 4 - starts;
5406 endinpos = startinpos + 2;
5407 break;
5408 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005409 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005410 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 continue;
5412 }
5413
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005414 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005415 errors,
5416 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005417 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005418 &starts,
5419 (const char **)&e,
5420 &startinpos,
5421 &endinpos,
5422 &exc,
5423 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005424 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005425 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 }
5427
Antoine Pitrou63065d72012-05-15 23:48:04 +02005428End:
Walter Dörwald69652032004-09-07 20:24:22 +00005429 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005430 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005431
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005432 Py_XDECREF(errorHandler);
5433 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005434 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435
Benjamin Peterson29060642009-01-31 22:14:21 +00005436 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005437 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005438 Py_XDECREF(errorHandler);
5439 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 return NULL;
5441}
5442
Tim Peters772747b2001-08-09 22:21:55 +00005443PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005444_PyUnicode_EncodeUTF16(PyObject *str,
5445 const char *errors,
5446 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005448 enum PyUnicode_Kind kind;
5449 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005450 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005451 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005452 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005453 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005454#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005455 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005456#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005457 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005458#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005459 const char *encoding;
5460 Py_ssize_t nsize, pos;
5461 PyObject *errorHandler = NULL;
5462 PyObject *exc = NULL;
5463 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005464
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005465 if (!PyUnicode_Check(str)) {
5466 PyErr_BadArgument();
5467 return NULL;
5468 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005469 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005470 return NULL;
5471 kind = PyUnicode_KIND(str);
5472 data = PyUnicode_DATA(str);
5473 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005474
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005475 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005476 if (kind == PyUnicode_4BYTE_KIND) {
5477 const Py_UCS4 *in = (const Py_UCS4 *)data;
5478 const Py_UCS4 *end = in + len;
5479 while (in < end)
5480 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005481 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005482 }
5483 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005484 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005485 nsize = len + pairs + (byteorder == 0);
5486 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487 if (v == NULL)
5488 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005490 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005491 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005492 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005494 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005495 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005496 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005497
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005498 if (kind == PyUnicode_1BYTE_KIND) {
5499 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5500 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005501 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005502
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005503 if (byteorder < 0)
5504 encoding = "utf-16-le";
5505 else if (byteorder > 0)
5506 encoding = "utf-16-be";
5507 else
5508 encoding = "utf-16";
5509
5510 pos = 0;
5511 while (pos < len) {
5512 Py_ssize_t repsize, moreunits;
5513
5514 if (kind == PyUnicode_2BYTE_KIND) {
5515 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5516 &out, native_ordering);
5517 }
5518 else {
5519 assert(kind == PyUnicode_4BYTE_KIND);
5520 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5521 &out, native_ordering);
5522 }
5523 if (pos == len)
5524 break;
5525
5526 rep = unicode_encode_call_errorhandler(
5527 errors, &errorHandler,
5528 encoding, "surrogates not allowed",
5529 str, &exc, pos, pos + 1, &pos);
5530 if (!rep)
5531 goto error;
5532
5533 if (PyBytes_Check(rep)) {
5534 repsize = PyBytes_GET_SIZE(rep);
5535 if (repsize & 1) {
5536 raise_encode_exception(&exc, encoding,
5537 str, pos - 1, pos,
5538 "surrogates not allowed");
5539 goto error;
5540 }
5541 moreunits = repsize / 2;
5542 }
5543 else {
5544 assert(PyUnicode_Check(rep));
5545 if (PyUnicode_READY(rep) < 0)
5546 goto error;
5547 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5548 if (!PyUnicode_IS_ASCII(rep)) {
5549 raise_encode_exception(&exc, encoding,
5550 str, pos - 1, pos,
5551 "surrogates not allowed");
5552 goto error;
5553 }
5554 }
5555
5556 /* two bytes are reserved for each surrogate */
5557 if (moreunits > 1) {
5558 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5559 Py_ssize_t morebytes = 2 * (moreunits - 1);
5560 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5561 /* integer overflow */
5562 PyErr_NoMemory();
5563 goto error;
5564 }
5565 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5566 goto error;
5567 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5568 }
5569
5570 if (PyBytes_Check(rep)) {
5571 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5572 out += moreunits;
5573 } else /* rep is unicode */ {
5574 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5575 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5576 &out, native_ordering);
5577 }
5578
5579 Py_CLEAR(rep);
5580 }
5581
5582 /* Cut back to size actually needed. This is necessary for, for example,
5583 encoding of a string containing isolated surrogates and the 'ignore' handler
5584 is used. */
5585 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5586 if (nsize != PyBytes_GET_SIZE(v))
5587 _PyBytes_Resize(&v, nsize);
5588 Py_XDECREF(errorHandler);
5589 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005590 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005591 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005592 error:
5593 Py_XDECREF(rep);
5594 Py_XDECREF(errorHandler);
5595 Py_XDECREF(exc);
5596 Py_XDECREF(v);
5597 return NULL;
5598#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599}
5600
Alexander Belopolsky40018472011-02-26 01:02:56 +00005601PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005602PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5603 Py_ssize_t size,
5604 const char *errors,
5605 int byteorder)
5606{
5607 PyObject *result;
5608 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5609 if (tmp == NULL)
5610 return NULL;
5611 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5612 Py_DECREF(tmp);
5613 return result;
5614}
5615
5616PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005617PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005619 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620}
5621
5622/* --- Unicode Escape Codec ----------------------------------------------- */
5623
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether the
   input, once its backslash escapes are processed, is still a pure ASCII
   string.

   Returns the number of characters needed to hold the decoded string, or
   -1 if that cannot be guaranteed: size is negative, the input contains a
   non-ASCII byte, a backslash is the last byte, or an escape is used whose
   result need not be ASCII (octal digits, \x, \u, \U, \N). */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before it is used in pointer arithmetic:
       forming a pointer before the start of the buffer is undefined
       behaviour. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* single-character escapes */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5678
Fredrik Lundh06d12682001-01-24 07:59:11 +00005679static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005680
Alexander Belopolsky40018472011-02-26 01:02:56 +00005681PyObject *
5682PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005683 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005684 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005687 Py_ssize_t startinpos;
5688 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005689 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005691 char* message;
5692 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005693 PyObject *errorHandler = NULL;
5694 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005695 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005696
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005697 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005698 if (len == 0)
5699 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005700
5701 /* After length_of_escaped_ascii_string() there are two alternatives,
5702 either the string is pure ASCII with named escapes like \n, etc.
5703 and we determined it's exact size (common case)
5704 or it contains \x, \u, ... escape sequences. then we create a
5705 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005706 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005707 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005708 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005709 }
5710 else {
5711 /* Escaped strings will always be longer than the resulting
5712 Unicode string, so we start with size here and then reduce the
5713 length after conversion to the true value.
5714 (but if the error callback returns a long replacement string
5715 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005716 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005717 }
5718
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005720 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005722
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723 while (s < end) {
5724 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005725 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005726 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727
5728 /* Non-escape characters are interpreted as Unicode ordinals */
5729 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005730 x = (unsigned char)*s;
5731 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005732 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005733 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 continue;
5735 }
5736
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005737 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738 /* \ - Escapes */
5739 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005740 c = *s++;
5741 if (s > end)
5742 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005743
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005744 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005747#define WRITECHAR(ch) \
5748 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005749 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005750 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005751 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005752
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005754 case '\\': WRITECHAR('\\'); break;
5755 case '\'': WRITECHAR('\''); break;
5756 case '\"': WRITECHAR('\"'); break;
5757 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005758 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005759 case 'f': WRITECHAR('\014'); break;
5760 case 't': WRITECHAR('\t'); break;
5761 case 'n': WRITECHAR('\n'); break;
5762 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005763 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005764 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005765 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005766 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 case '0': case '1': case '2': case '3':
5770 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005771 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005772 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005773 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005774 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005775 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005777 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 break;
5779
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 /* hex escapes */
5781 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005783 digits = 2;
5784 message = "truncated \\xXX escape";
5785 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005789 digits = 4;
5790 message = "truncated \\uXXXX escape";
5791 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005794 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005795 digits = 8;
5796 message = "truncated \\UXXXXXXXX escape";
5797 hexescape:
5798 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005799 if (end - s < digits) {
5800 /* count only hex digits */
5801 for (; s < end; ++s) {
5802 c = (unsigned char)*s;
5803 if (!Py_ISXDIGIT(c))
5804 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005805 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005806 goto error;
5807 }
5808 for (; digits--; ++s) {
5809 c = (unsigned char)*s;
5810 if (!Py_ISXDIGIT(c))
5811 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005812 chr = (chr<<4) & ~0xF;
5813 if (c >= '0' && c <= '9')
5814 chr += c - '0';
5815 else if (c >= 'a' && c <= 'f')
5816 chr += 10 + c - 'a';
5817 else
5818 chr += 10 + c - 'A';
5819 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005820 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005821 /* _decoding_error will have already written into the
5822 target buffer. */
5823 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005824 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005825 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005826 message = "illegal Unicode character";
5827 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005828 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005829 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005830 break;
5831
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005833 case 'N':
5834 message = "malformed \\N character escape";
5835 if (ucnhash_CAPI == NULL) {
5836 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005837 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5838 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005839 if (ucnhash_CAPI == NULL)
5840 goto ucnhashError;
5841 }
5842 if (*s == '{') {
5843 const char *start = s+1;
5844 /* look for the closing brace */
5845 while (*s != '}' && s < end)
5846 s++;
5847 if (s > start && s < end && *s == '}') {
5848 /* found a name. look it up in the unicode database */
5849 message = "unknown Unicode character name";
5850 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005851 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005852 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005853 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005854 goto store;
5855 }
5856 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005857 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005858
5859 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005860 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005861 message = "\\ at end of string";
5862 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005863 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005864 }
5865 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005866 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005867 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005868 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005869 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005871 continue;
5872
5873 error:
5874 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005875 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005876 errors, &errorHandler,
5877 "unicodeescape", message,
5878 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005879 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005880 goto onError;
5881 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005883#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005884
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005885 Py_XDECREF(errorHandler);
5886 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005887 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005888
Benjamin Peterson29060642009-01-31 22:14:21 +00005889 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005890 PyErr_SetString(
5891 PyExc_UnicodeError,
5892 "\\N escapes not supported (can't load unicodedata module)"
5893 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005894 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005895 Py_XDECREF(errorHandler);
5896 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005897 return NULL;
5898
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005900 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005901 Py_XDECREF(errorHandler);
5902 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 return NULL;
5904}
5905
5906/* Return a Unicode-Escape string version of the Unicode object.
5907
5908 If quotes is true, the string is enclosed in u"" or u'' quotes as
5909 appropriate.
5910
5911*/
5912
Alexander Belopolsky40018472011-02-26 01:02:56 +00005913PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005914PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005916 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005917 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005919 int kind;
5920 void *data;
5921 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922
Ezio Melottie7f90372012-10-05 03:33:31 +03005923 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005924 escape.
5925
Ezio Melottie7f90372012-10-05 03:33:31 +03005926 For UCS1 strings it's '\xxx', 4 bytes per source character.
5927 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5928 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005929 */
5930
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005931 if (!PyUnicode_Check(unicode)) {
5932 PyErr_BadArgument();
5933 return NULL;
5934 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005935 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005936 return NULL;
5937 len = PyUnicode_GET_LENGTH(unicode);
5938 kind = PyUnicode_KIND(unicode);
5939 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005940 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005941 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5942 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5943 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5944 }
5945
5946 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005947 return PyBytes_FromStringAndSize(NULL, 0);
5948
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005949 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005950 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005951
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005952 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005953 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005954 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 if (repr == NULL)
5957 return NULL;
5958
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005959 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005961 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005962 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005963
Walter Dörwald79e913e2007-05-12 11:08:06 +00005964 /* Escape backslashes */
5965 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 *p++ = '\\';
5967 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005968 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005969 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005970
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005971 /* Map 21-bit characters to '\U00xxxxxx' */
5972 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005973 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005974 *p++ = '\\';
5975 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005976 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5977 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5978 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5979 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5980 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5981 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5982 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5983 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005985 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005986
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005988 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 *p++ = '\\';
5990 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005991 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5992 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5993 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5994 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005996
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005997 /* Map special whitespace to '\t', \n', '\r' */
5998 else if (ch == '\t') {
5999 *p++ = '\\';
6000 *p++ = 't';
6001 }
6002 else if (ch == '\n') {
6003 *p++ = '\\';
6004 *p++ = 'n';
6005 }
6006 else if (ch == '\r') {
6007 *p++ = '\\';
6008 *p++ = 'r';
6009 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006010
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006011 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006012 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006014 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006015 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6016 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006017 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006018
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019 /* Copy everything else as-is */
6020 else
6021 *p++ = (char) ch;
6022 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006024 assert(p - PyBytes_AS_STRING(repr) > 0);
6025 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6026 return NULL;
6027 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028}
6029
Alexander Belopolsky40018472011-02-26 01:02:56 +00006030PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006031PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6032 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006034 PyObject *result;
6035 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6036 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006038 result = PyUnicode_AsUnicodeEscapeString(tmp);
6039 Py_DECREF(tmp);
6040 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041}
6042
6043/* --- Raw Unicode Escape Codec ------------------------------------------- */
6044
Alexander Belopolsky40018472011-02-26 01:02:56 +00006045PyObject *
6046PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006047 Py_ssize_t size,
6048 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006050 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006051 Py_ssize_t startinpos;
6052 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006053 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 const char *end;
6055 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006056 PyObject *errorHandler = NULL;
6057 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006058
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006059 if (size == 0)
6060 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006061
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 /* Escaped strings will always be longer than the resulting
6063 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006064 length after conversion to the true value. (But decoding error
6065 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006066 _PyUnicodeWriter_Init(&writer);
6067 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006068
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 end = s + size;
6070 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 unsigned char c;
6072 Py_UCS4 x;
6073 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006074 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 /* Non-escape characters are interpreted as Unicode ordinals */
6077 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006078 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006079 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006080 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006082 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006083 startinpos = s-starts;
6084
6085 /* \u-escapes are only interpreted iff the number of leading
6086 backslashes if odd */
6087 bs = s;
6088 for (;s < end;) {
6089 if (*s != '\\')
6090 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006091 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006092 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006093 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 }
6095 if (((s - bs) & 1) == 0 ||
6096 s >= end ||
6097 (*s != 'u' && *s != 'U')) {
6098 continue;
6099 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006100 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 count = *s=='u' ? 4 : 8;
6102 s++;
6103
6104 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006105 for (x = 0, i = 0; i < count; ++i, ++s) {
6106 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006107 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006108 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006109 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 errors, &errorHandler,
6111 "rawunicodeescape", "truncated \\uXXXX",
6112 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006113 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006114 goto onError;
6115 goto nextByte;
6116 }
6117 x = (x<<4) & ~0xF;
6118 if (c >= '0' && c <= '9')
6119 x += c - '0';
6120 else if (c >= 'a' && c <= 'f')
6121 x += 10 + c - 'a';
6122 else
6123 x += 10 + c - 'A';
6124 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006125 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006126 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006127 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006128 }
6129 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006130 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006131 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006132 errors, &errorHandler,
6133 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006135 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006137 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006138 nextByte:
6139 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006141 Py_XDECREF(errorHandler);
6142 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006143 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006144
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006146 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006147 Py_XDECREF(errorHandler);
6148 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 return NULL;
6150}
6151
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006152
Alexander Belopolsky40018472011-02-26 01:02:56 +00006153PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006154PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006156 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 char *p;
6158 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006159 Py_ssize_t expandsize, pos;
6160 int kind;
6161 void *data;
6162 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006164 if (!PyUnicode_Check(unicode)) {
6165 PyErr_BadArgument();
6166 return NULL;
6167 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006168 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006169 return NULL;
6170 kind = PyUnicode_KIND(unicode);
6171 data = PyUnicode_DATA(unicode);
6172 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006173 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6174 bytes, and 1 byte characters 4. */
6175 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006176
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006177 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006179
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006180 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181 if (repr == NULL)
6182 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006183 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006184 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006186 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006187 for (pos = 0; pos < len; pos++) {
6188 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 /* Map 32-bit characters to '\Uxxxxxxxx' */
6190 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006191 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006192 *p++ = '\\';
6193 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006194 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6195 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6196 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6197 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6198 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6199 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6200 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6201 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006202 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006204 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 *p++ = '\\';
6206 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006207 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6208 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6209 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6210 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006212 /* Copy everything else as-is */
6213 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 *p++ = (char) ch;
6215 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006216
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006217 assert(p > q);
6218 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006219 return NULL;
6220 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221}
6222
Alexander Belopolsky40018472011-02-26 01:02:56 +00006223PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006224PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6225 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006227 PyObject *result;
6228 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6229 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006230 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006231 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6232 Py_DECREF(tmp);
6233 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234}
6235
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006236/* --- Unicode Internal Codec ------------------------------------------- */
6237
Alexander Belopolsky40018472011-02-26 01:02:56 +00006238PyObject *
6239_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006240 Py_ssize_t size,
6241 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006242{
6243 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006244 Py_ssize_t startinpos;
6245 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006246 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006247 const char *end;
6248 const char *reason;
6249 PyObject *errorHandler = NULL;
6250 PyObject *exc = NULL;
6251
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006252 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006253 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006254 1))
6255 return NULL;
6256
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006257 if (size == 0)
6258 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006259
Victor Stinner8f674cc2013-04-17 23:02:17 +02006260 _PyUnicodeWriter_Init(&writer);
6261 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6262 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006264 }
6265 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006266
Victor Stinner8f674cc2013-04-17 23:02:17 +02006267 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006268 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006269 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006270 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006271 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006272 endinpos = end-starts;
6273 reason = "truncated input";
6274 goto error;
6275 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006276 /* We copy the raw representation one byte at a time because the
6277 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006278 ((char *) &uch)[0] = s[0];
6279 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006280#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006281 ((char *) &uch)[2] = s[2];
6282 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006283#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006284 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006285#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006286 /* We have to sanity check the raw data, otherwise doom looms for
6287 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006288 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006289 endinpos = s - starts + Py_UNICODE_SIZE;
6290 reason = "illegal code point (> 0x10FFFF)";
6291 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006292 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006293#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006294 s += Py_UNICODE_SIZE;
6295#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006296 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006297 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006298 Py_UNICODE uch2;
6299 ((char *) &uch2)[0] = s[0];
6300 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006301 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006302 {
Victor Stinner551ac952011-11-29 22:58:13 +01006303 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006304 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006305 }
6306 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006307#endif
6308
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006309 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006310 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006311 continue;
6312
6313 error:
6314 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006315 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006316 errors, &errorHandler,
6317 "unicode_internal", reason,
6318 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006319 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006320 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006321 }
6322
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006323 Py_XDECREF(errorHandler);
6324 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006325 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006326
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006328 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006329 Py_XDECREF(errorHandler);
6330 Py_XDECREF(exc);
6331 return NULL;
6332}
6333
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334/* --- Latin-1 Codec ------------------------------------------------------ */
6335
Alexander Belopolsky40018472011-02-26 01:02:56 +00006336PyObject *
6337PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006338 Py_ssize_t size,
6339 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006342 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343}
6344
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006345/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006346static void
6347make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006348 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006349 PyObject *unicode,
6350 Py_ssize_t startpos, Py_ssize_t endpos,
6351 const char *reason)
6352{
6353 if (*exceptionObject == NULL) {
6354 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006355 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006356 encoding, unicode, startpos, endpos, reason);
6357 }
6358 else {
6359 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6360 goto onError;
6361 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6362 goto onError;
6363 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6364 goto onError;
6365 return;
6366 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006367 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006368 }
6369}
6370
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006371/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006372static void
6373raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006374 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006375 PyObject *unicode,
6376 Py_ssize_t startpos, Py_ssize_t endpos,
6377 const char *reason)
6378{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006379 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006380 encoding, unicode, startpos, endpos, reason);
6381 if (*exceptionObject != NULL)
6382 PyCodec_StrictErrors(*exceptionObject);
6383}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006384
6385/* error handling callback helper:
6386 build arguments, call the callback and check the arguments,
6387 put the result into newpos and return the replacement string, which
6388 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006389static PyObject *
6390unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006391 PyObject **errorHandler,
6392 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006393 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006394 Py_ssize_t startpos, Py_ssize_t endpos,
6395 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006396{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006397 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006398 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 PyObject *restuple;
6400 PyObject *resunicode;
6401
6402 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006403 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006404 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006405 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006406 }
6407
Benjamin Petersonbac79492012-01-14 13:34:47 -05006408 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006409 return NULL;
6410 len = PyUnicode_GET_LENGTH(unicode);
6411
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006412 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006413 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006416
6417 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006422 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006423 Py_DECREF(restuple);
6424 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006425 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006426 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 &resunicode, newpos)) {
6428 Py_DECREF(restuple);
6429 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006431 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6432 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6433 Py_DECREF(restuple);
6434 return NULL;
6435 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006437 *newpos = len + *newpos;
6438 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6440 Py_DECREF(restuple);
6441 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006442 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006443 Py_INCREF(resunicode);
6444 Py_DECREF(restuple);
6445 return resunicode;
6446}
6447
Alexander Belopolsky40018472011-02-26 01:02:56 +00006448static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006449unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006450 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006451 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006452{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006453 /* input state */
6454 Py_ssize_t pos=0, size;
6455 int kind;
6456 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006457 /* output object */
6458 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006459 /* pointer into the output */
6460 char *str;
6461 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006462 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006463 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6464 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006465 PyObject *errorHandler = NULL;
6466 PyObject *exc = NULL;
6467 /* the following variable is used for caching string comparisons
6468 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6469 int known_errorHandler = -1;
6470
Benjamin Petersonbac79492012-01-14 13:34:47 -05006471 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006472 return NULL;
6473 size = PyUnicode_GET_LENGTH(unicode);
6474 kind = PyUnicode_KIND(unicode);
6475 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006476 /* allocate enough for a simple encoding without
6477 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006478 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006479 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006480 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006481 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006482 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006483 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006484 ressize = size;
6485
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006486 while (pos < size) {
6487 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006488
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 /* can we encode this? */
6490 if (c<limit) {
6491 /* no overflow check, because we know that the space is enough */
6492 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006493 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006494 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 Py_ssize_t requiredsize;
6497 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006498 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006500 Py_ssize_t collstart = pos;
6501 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006503 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 ++collend;
6505 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6506 if (known_errorHandler==-1) {
6507 if ((errors==NULL) || (!strcmp(errors, "strict")))
6508 known_errorHandler = 1;
6509 else if (!strcmp(errors, "replace"))
6510 known_errorHandler = 2;
6511 else if (!strcmp(errors, "ignore"))
6512 known_errorHandler = 3;
6513 else if (!strcmp(errors, "xmlcharrefreplace"))
6514 known_errorHandler = 4;
6515 else
6516 known_errorHandler = 0;
6517 }
6518 switch (known_errorHandler) {
6519 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006520 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 goto onError;
6522 case 2: /* replace */
6523 while (collstart++<collend)
6524 *str++ = '?'; /* fall through */
6525 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006526 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 break;
6528 case 4: /* xmlcharrefreplace */
6529 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006530 /* determine replacement size */
6531 for (i = collstart, repsize = 0; i < collend; ++i) {
6532 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6533 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006535 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006537 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006539 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006541 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006543 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006545 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006546 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006548 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006550 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006551 if (requiredsize > ressize) {
6552 if (requiredsize<2*ressize)
6553 requiredsize = 2*ressize;
6554 if (_PyBytes_Resize(&res, requiredsize))
6555 goto onError;
6556 str = PyBytes_AS_STRING(res) + respos;
6557 ressize = requiredsize;
6558 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006559 /* generate replacement */
6560 for (i = collstart; i < collend; ++i) {
6561 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006562 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006563 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 break;
6565 default:
6566 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006567 encoding, reason, unicode, &exc,
6568 collstart, collend, &newpos);
6569 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006570 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006571 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006572 if (PyBytes_Check(repunicode)) {
6573 /* Directly copy bytes result to output. */
6574 repsize = PyBytes_Size(repunicode);
6575 if (repsize > 1) {
6576 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006577 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006578 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6579 Py_DECREF(repunicode);
6580 goto onError;
6581 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006582 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006583 ressize += repsize-1;
6584 }
6585 memcpy(str, PyBytes_AsString(repunicode), repsize);
6586 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006587 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006588 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006589 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006590 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 /* need more space? (at least enough for what we
6592 have+the replacement+the rest of the string, so
6593 we won't have to check space for encodable characters) */
6594 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006595 repsize = PyUnicode_GET_LENGTH(repunicode);
6596 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 if (requiredsize > ressize) {
6598 if (requiredsize<2*ressize)
6599 requiredsize = 2*ressize;
6600 if (_PyBytes_Resize(&res, requiredsize)) {
6601 Py_DECREF(repunicode);
6602 goto onError;
6603 }
6604 str = PyBytes_AS_STRING(res) + respos;
6605 ressize = requiredsize;
6606 }
6607 /* check if there is anything unencodable in the replacement
6608 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006609 for (i = 0; repsize-->0; ++i, ++str) {
6610 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006612 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006613 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 Py_DECREF(repunicode);
6615 goto onError;
6616 }
6617 *str = (char)c;
6618 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006619 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006620 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006621 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006622 }
6623 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006624 /* Resize if we allocated to much */
6625 size = str - PyBytes_AS_STRING(res);
6626 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006627 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006628 if (_PyBytes_Resize(&res, size) < 0)
6629 goto onError;
6630 }
6631
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006632 Py_XDECREF(errorHandler);
6633 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006634 return res;
6635
6636 onError:
6637 Py_XDECREF(res);
6638 Py_XDECREF(errorHandler);
6639 Py_XDECREF(exc);
6640 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006641}
6642
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006643/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006644PyObject *
6645PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006646 Py_ssize_t size,
6647 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006649 PyObject *result;
6650 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6651 if (unicode == NULL)
6652 return NULL;
6653 result = unicode_encode_ucs1(unicode, errors, 256);
6654 Py_DECREF(unicode);
6655 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656}
6657
Alexander Belopolsky40018472011-02-26 01:02:56 +00006658PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006659_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660{
6661 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006662 PyErr_BadArgument();
6663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006665 if (PyUnicode_READY(unicode) == -1)
6666 return NULL;
6667 /* Fast path: if it is a one-byte string, construct
6668 bytes object directly. */
6669 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6670 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6671 PyUnicode_GET_LENGTH(unicode));
6672 /* Non-Latin-1 characters present. Defer to above function to
6673 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006674 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006675}
6676
6677PyObject*
6678PyUnicode_AsLatin1String(PyObject *unicode)
6679{
6680 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681}
6682
6683/* --- 7-bit ASCII Codec -------------------------------------------------- */
6684
Alexander Belopolsky40018472011-02-26 01:02:56 +00006685PyObject *
6686PyUnicode_DecodeASCII(const char *s,
6687 Py_ssize_t size,
6688 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006690 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006691 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006692 int kind;
6693 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006694 Py_ssize_t startinpos;
6695 Py_ssize_t endinpos;
6696 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006697 const char *e;
6698 PyObject *errorHandler = NULL;
6699 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006700
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006702 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006703
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006705 if (size == 1 && (unsigned char)s[0] < 128)
6706 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006707
Victor Stinner8f674cc2013-04-17 23:02:17 +02006708 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006709 writer.min_length = size;
6710 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006711 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006712
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006713 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006714 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006715 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006716 writer.pos = outpos;
6717 if (writer.pos == size)
6718 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006719
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006720 s += writer.pos;
6721 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006722 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006723 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006724 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006725 PyUnicode_WRITE(kind, data, writer.pos, c);
6726 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 ++s;
6728 }
6729 else {
6730 startinpos = s-starts;
6731 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006732 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 errors, &errorHandler,
6734 "ascii", "ordinal not in range(128)",
6735 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006736 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006738 kind = writer.kind;
6739 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006742 Py_XDECREF(errorHandler);
6743 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006744 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006745
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006747 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006748 Py_XDECREF(errorHandler);
6749 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 return NULL;
6751}
6752
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006753/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006754PyObject *
6755PyUnicode_EncodeASCII(const Py_UNICODE *p,
6756 Py_ssize_t size,
6757 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006759 PyObject *result;
6760 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6761 if (unicode == NULL)
6762 return NULL;
6763 result = unicode_encode_ucs1(unicode, errors, 128);
6764 Py_DECREF(unicode);
6765 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766}
6767
Alexander Belopolsky40018472011-02-26 01:02:56 +00006768PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006769_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770{
6771 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 PyErr_BadArgument();
6773 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006775 if (PyUnicode_READY(unicode) == -1)
6776 return NULL;
6777 /* Fast path: if it is an ASCII-only string, construct bytes object
6778 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006779 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006780 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6781 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006782 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006783}
6784
6785PyObject *
6786PyUnicode_AsASCIIString(PyObject *unicode)
6787{
6788 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789}
6790
Victor Stinner99b95382011-07-04 14:23:54 +02006791#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006792
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006793/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006794
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006795#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006796#define NEED_RETRY
6797#endif
6798
Victor Stinner3a50e702011-10-18 21:21:00 +02006799#ifndef WC_ERR_INVALID_CHARS
6800# define WC_ERR_INVALID_CHARS 0x0080
6801#endif
6802
6803static char*
6804code_page_name(UINT code_page, PyObject **obj)
6805{
6806 *obj = NULL;
6807 if (code_page == CP_ACP)
6808 return "mbcs";
6809 if (code_page == CP_UTF7)
6810 return "CP_UTF7";
6811 if (code_page == CP_UTF8)
6812 return "CP_UTF8";
6813
6814 *obj = PyBytes_FromFormat("cp%u", code_page);
6815 if (*obj == NULL)
6816 return NULL;
6817 return PyBytes_AS_STRING(*obj);
6818}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006819
Victor Stinner3a50e702011-10-18 21:21:00 +02006820static DWORD
6821decode_code_page_flags(UINT code_page)
6822{
6823 if (code_page == CP_UTF7) {
6824 /* The CP_UTF7 decoder only supports flags=0 */
6825 return 0;
6826 }
6827 else
6828 return MB_ERR_INVALID_CHARS;
6829}
6830
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006831/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006832 * Decode a byte string from a Windows code page into unicode object in strict
6833 * mode.
6834 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006835 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6836 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006837 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006838static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006839decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006840 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006841 const char *in,
6842 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006843{
Victor Stinner3a50e702011-10-18 21:21:00 +02006844 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006845 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006846 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006847
6848 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006849 assert(insize > 0);
6850 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6851 if (outsize <= 0)
6852 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006853
6854 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006856 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006857 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 if (*v == NULL)
6859 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006860 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006861 }
6862 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006864 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006865 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006866 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006867 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006868 }
6869
6870 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006871 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6872 if (outsize <= 0)
6873 goto error;
6874 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006875
Victor Stinner3a50e702011-10-18 21:21:00 +02006876error:
6877 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6878 return -2;
6879 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006880 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006881}
6882
Victor Stinner3a50e702011-10-18 21:21:00 +02006883/*
6884 * Decode a byte string from a code page into unicode object with an error
6885 * handler.
6886 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006887 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006888 * UnicodeDecodeError exception and returns -1 on error.
6889 */
6890static int
6891decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006892 PyObject **v,
6893 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01006894 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02006895{
6896 const char *startin = in;
6897 const char *endin = in + size;
6898 const DWORD flags = decode_code_page_flags(code_page);
6899 /* Ideally, we should get reason from FormatMessage. This is the Windows
6900 2000 English version of the message. */
6901 const char *reason = "No mapping for the Unicode character exists "
6902 "in the target code page.";
6903 /* each step cannot decode more than 1 character, but a character can be
6904 represented as a surrogate pair */
6905 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006906 int insize;
6907 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006908 PyObject *errorHandler = NULL;
6909 PyObject *exc = NULL;
6910 PyObject *encoding_obj = NULL;
6911 char *encoding;
6912 DWORD err;
6913 int ret = -1;
6914
6915 assert(size > 0);
6916
6917 encoding = code_page_name(code_page, &encoding_obj);
6918 if (encoding == NULL)
6919 return -1;
6920
Victor Stinner7d00cc12014-03-17 23:08:06 +01006921 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02006922 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6923 UnicodeDecodeError. */
6924 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6925 if (exc != NULL) {
6926 PyCodec_StrictErrors(exc);
6927 Py_CLEAR(exc);
6928 }
6929 goto error;
6930 }
6931
6932 if (*v == NULL) {
6933 /* Create unicode object */
6934 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6935 PyErr_NoMemory();
6936 goto error;
6937 }
Victor Stinnerab595942011-12-17 04:59:06 +01006938 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006939 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006940 if (*v == NULL)
6941 goto error;
6942 startout = PyUnicode_AS_UNICODE(*v);
6943 }
6944 else {
6945 /* Extend unicode object */
6946 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6947 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6948 PyErr_NoMemory();
6949 goto error;
6950 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006951 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006952 goto error;
6953 startout = PyUnicode_AS_UNICODE(*v) + n;
6954 }
6955
6956 /* Decode the byte string character per character */
6957 out = startout;
6958 while (in < endin)
6959 {
6960 /* Decode a character */
6961 insize = 1;
6962 do
6963 {
6964 outsize = MultiByteToWideChar(code_page, flags,
6965 in, insize,
6966 buffer, Py_ARRAY_LENGTH(buffer));
6967 if (outsize > 0)
6968 break;
6969 err = GetLastError();
6970 if (err != ERROR_NO_UNICODE_TRANSLATION
6971 && err != ERROR_INSUFFICIENT_BUFFER)
6972 {
6973 PyErr_SetFromWindowsErr(0);
6974 goto error;
6975 }
6976 insize++;
6977 }
6978 /* 4=maximum length of a UTF-8 sequence */
6979 while (insize <= 4 && (in + insize) <= endin);
6980
6981 if (outsize <= 0) {
6982 Py_ssize_t startinpos, endinpos, outpos;
6983
Victor Stinner7d00cc12014-03-17 23:08:06 +01006984 /* last character in partial decode? */
6985 if (in + insize >= endin && !final)
6986 break;
6987
Victor Stinner3a50e702011-10-18 21:21:00 +02006988 startinpos = in - startin;
6989 endinpos = startinpos + 1;
6990 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006991 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006992 errors, &errorHandler,
6993 encoding, reason,
6994 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006995 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006996 {
6997 goto error;
6998 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006999 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007000 }
7001 else {
7002 in += insize;
7003 memcpy(out, buffer, outsize * sizeof(wchar_t));
7004 out += outsize;
7005 }
7006 }
7007
7008 /* write a NUL character at the end */
7009 *out = 0;
7010
7011 /* Extend unicode object */
7012 outsize = out - startout;
7013 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007014 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007015 goto error;
Victor Stinner7d00cc12014-03-17 23:08:06 +01007016 ret = in - startin;
Victor Stinner3a50e702011-10-18 21:21:00 +02007017
7018error:
7019 Py_XDECREF(encoding_obj);
7020 Py_XDECREF(errorHandler);
7021 Py_XDECREF(exc);
7022 return ret;
7023}
7024
Victor Stinner3a50e702011-10-18 21:21:00 +02007025static PyObject *
7026decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007027 const char *s, Py_ssize_t size,
7028 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007029{
Victor Stinner76a31a62011-11-04 00:05:13 +01007030 PyObject *v = NULL;
7031 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007032
Victor Stinner3a50e702011-10-18 21:21:00 +02007033 if (code_page < 0) {
7034 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7035 return NULL;
7036 }
7037
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007038 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007039 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007040
Victor Stinner76a31a62011-11-04 00:05:13 +01007041 do
7042 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007043#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007044 if (size > INT_MAX) {
7045 chunk_size = INT_MAX;
7046 final = 0;
7047 done = 0;
7048 }
7049 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007050#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007051 {
7052 chunk_size = (int)size;
7053 final = (consumed == NULL);
7054 done = 1;
7055 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007056
Victor Stinner76a31a62011-11-04 00:05:13 +01007057 if (chunk_size == 0 && done) {
7058 if (v != NULL)
7059 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007060 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007061 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007062
Victor Stinner76a31a62011-11-04 00:05:13 +01007063 converted = decode_code_page_strict(code_page, &v,
7064 s, chunk_size);
7065 if (converted == -2)
7066 converted = decode_code_page_errors(code_page, &v,
7067 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007068 errors, final);
7069 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007070
7071 if (converted < 0) {
7072 Py_XDECREF(v);
7073 return NULL;
7074 }
7075
7076 if (consumed)
7077 *consumed += converted;
7078
7079 s += converted;
7080 size -= converted;
7081 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007082
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007083 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007084}
7085
Alexander Belopolsky40018472011-02-26 01:02:56 +00007086PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007087PyUnicode_DecodeCodePageStateful(int code_page,
7088 const char *s,
7089 Py_ssize_t size,
7090 const char *errors,
7091 Py_ssize_t *consumed)
7092{
7093 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7094}
7095
7096PyObject *
7097PyUnicode_DecodeMBCSStateful(const char *s,
7098 Py_ssize_t size,
7099 const char *errors,
7100 Py_ssize_t *consumed)
7101{
7102 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7103}
7104
7105PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007106PyUnicode_DecodeMBCS(const char *s,
7107 Py_ssize_t size,
7108 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007109{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007110 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7111}
7112
Victor Stinner3a50e702011-10-18 21:21:00 +02007113static DWORD
7114encode_code_page_flags(UINT code_page, const char *errors)
7115{
7116 if (code_page == CP_UTF8) {
7117 if (winver.dwMajorVersion >= 6)
7118 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7119 and later */
7120 return WC_ERR_INVALID_CHARS;
7121 else
7122 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7123 return 0;
7124 }
7125 else if (code_page == CP_UTF7) {
7126 /* CP_UTF7 only supports flags=0 */
7127 return 0;
7128 }
7129 else {
7130 if (errors != NULL && strcmp(errors, "replace") == 0)
7131 return 0;
7132 else
7133 return WC_NO_BEST_FIT_CHARS;
7134 }
7135}
7136
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007137/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007138 * Encode a Unicode string to a Windows code page into a byte string in strict
7139 * mode.
7140 *
7141 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007142 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007143 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007144static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007145encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007146 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007147 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007148{
Victor Stinner554f3f02010-06-16 23:33:54 +00007149 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007150 BOOL *pusedDefaultChar = &usedDefaultChar;
7151 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007152 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007153 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007154 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007155 const DWORD flags = encode_code_page_flags(code_page, NULL);
7156 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007157 /* Create a substring so that we can get the UTF-16 representation
7158 of just the slice under consideration. */
7159 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007160
Martin v. Löwis3d325192011-11-04 18:23:06 +01007161 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007162
Victor Stinner3a50e702011-10-18 21:21:00 +02007163 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007164 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007165 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007166 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007167
Victor Stinner2fc507f2011-11-04 20:06:39 +01007168 substring = PyUnicode_Substring(unicode, offset, offset+len);
7169 if (substring == NULL)
7170 return -1;
7171 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7172 if (p == NULL) {
7173 Py_DECREF(substring);
7174 return -1;
7175 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007176 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007177
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007178 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007180 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007181 NULL, 0,
7182 NULL, pusedDefaultChar);
7183 if (outsize <= 0)
7184 goto error;
7185 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007186 if (pusedDefaultChar && *pusedDefaultChar) {
7187 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007188 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007189 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007190
Victor Stinner3a50e702011-10-18 21:21:00 +02007191 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007192 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007193 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007194 if (*outbytes == NULL) {
7195 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007196 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007197 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007198 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007199 }
7200 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007202 const Py_ssize_t n = PyBytes_Size(*outbytes);
7203 if (outsize > PY_SSIZE_T_MAX - n) {
7204 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007205 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007206 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007207 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007208 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7209 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007210 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007211 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007212 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007213 }
7214
7215 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007217 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007218 out, outsize,
7219 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007220 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007221 if (outsize <= 0)
7222 goto error;
7223 if (pusedDefaultChar && *pusedDefaultChar)
7224 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007225 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007226
Victor Stinner3a50e702011-10-18 21:21:00 +02007227error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007228 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007229 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7230 return -2;
7231 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007232 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007233}
7234
Victor Stinner3a50e702011-10-18 21:21:00 +02007235/*
7236 * Encode a Unicode string to a Windows code page into a byte string using a
7237 * error handler.
7238 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007239 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007240 * -1 on other error.
7241 */
7242static int
7243encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007244 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007245 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007246{
Victor Stinner3a50e702011-10-18 21:21:00 +02007247 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007248 Py_ssize_t pos = unicode_offset;
7249 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007250 /* Ideally, we should get reason from FormatMessage. This is the Windows
7251 2000 English version of the message. */
7252 const char *reason = "invalid character";
7253 /* 4=maximum length of a UTF-8 sequence */
7254 char buffer[4];
7255 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7256 Py_ssize_t outsize;
7257 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007258 PyObject *errorHandler = NULL;
7259 PyObject *exc = NULL;
7260 PyObject *encoding_obj = NULL;
7261 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007262 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007263 PyObject *rep;
7264 int ret = -1;
7265
7266 assert(insize > 0);
7267
7268 encoding = code_page_name(code_page, &encoding_obj);
7269 if (encoding == NULL)
7270 return -1;
7271
7272 if (errors == NULL || strcmp(errors, "strict") == 0) {
7273 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7274 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007275 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007276 if (exc != NULL) {
7277 PyCodec_StrictErrors(exc);
7278 Py_DECREF(exc);
7279 }
7280 Py_XDECREF(encoding_obj);
7281 return -1;
7282 }
7283
7284 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7285 pusedDefaultChar = &usedDefaultChar;
7286 else
7287 pusedDefaultChar = NULL;
7288
7289 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7290 PyErr_NoMemory();
7291 goto error;
7292 }
7293 outsize = insize * Py_ARRAY_LENGTH(buffer);
7294
7295 if (*outbytes == NULL) {
7296 /* Create string object */
7297 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7298 if (*outbytes == NULL)
7299 goto error;
7300 out = PyBytes_AS_STRING(*outbytes);
7301 }
7302 else {
7303 /* Extend string object */
7304 Py_ssize_t n = PyBytes_Size(*outbytes);
7305 if (n > PY_SSIZE_T_MAX - outsize) {
7306 PyErr_NoMemory();
7307 goto error;
7308 }
7309 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7310 goto error;
7311 out = PyBytes_AS_STRING(*outbytes) + n;
7312 }
7313
7314 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007315 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007316 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007317 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7318 wchar_t chars[2];
7319 int charsize;
7320 if (ch < 0x10000) {
7321 chars[0] = (wchar_t)ch;
7322 charsize = 1;
7323 }
7324 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007325 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7326 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007327 charsize = 2;
7328 }
7329
Victor Stinner3a50e702011-10-18 21:21:00 +02007330 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007331 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007332 buffer, Py_ARRAY_LENGTH(buffer),
7333 NULL, pusedDefaultChar);
7334 if (outsize > 0) {
7335 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7336 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007337 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007338 memcpy(out, buffer, outsize);
7339 out += outsize;
7340 continue;
7341 }
7342 }
7343 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7344 PyErr_SetFromWindowsErr(0);
7345 goto error;
7346 }
7347
Victor Stinner3a50e702011-10-18 21:21:00 +02007348 rep = unicode_encode_call_errorhandler(
7349 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007350 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007351 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007352 if (rep == NULL)
7353 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007354 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007355
7356 if (PyBytes_Check(rep)) {
7357 outsize = PyBytes_GET_SIZE(rep);
7358 if (outsize != 1) {
7359 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7360 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7361 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7362 Py_DECREF(rep);
7363 goto error;
7364 }
7365 out = PyBytes_AS_STRING(*outbytes) + offset;
7366 }
7367 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7368 out += outsize;
7369 }
7370 else {
7371 Py_ssize_t i;
7372 enum PyUnicode_Kind kind;
7373 void *data;
7374
Benjamin Petersonbac79492012-01-14 13:34:47 -05007375 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007376 Py_DECREF(rep);
7377 goto error;
7378 }
7379
7380 outsize = PyUnicode_GET_LENGTH(rep);
7381 if (outsize != 1) {
7382 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7383 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7384 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7385 Py_DECREF(rep);
7386 goto error;
7387 }
7388 out = PyBytes_AS_STRING(*outbytes) + offset;
7389 }
7390 kind = PyUnicode_KIND(rep);
7391 data = PyUnicode_DATA(rep);
7392 for (i=0; i < outsize; i++) {
7393 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7394 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007395 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007396 encoding, unicode,
7397 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007398 "unable to encode error handler result to ASCII");
7399 Py_DECREF(rep);
7400 goto error;
7401 }
7402 *out = (unsigned char)ch;
7403 out++;
7404 }
7405 }
7406 Py_DECREF(rep);
7407 }
7408 /* write a NUL byte */
7409 *out = 0;
7410 outsize = out - PyBytes_AS_STRING(*outbytes);
7411 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7412 if (_PyBytes_Resize(outbytes, outsize) < 0)
7413 goto error;
7414 ret = 0;
7415
7416error:
7417 Py_XDECREF(encoding_obj);
7418 Py_XDECREF(errorHandler);
7419 Py_XDECREF(exc);
7420 return ret;
7421}
7422
Victor Stinner3a50e702011-10-18 21:21:00 +02007423static PyObject *
7424encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007425 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007426 const char *errors)
7427{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007428 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007430 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007431 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007432
Benjamin Petersonbac79492012-01-14 13:34:47 -05007433 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007434 return NULL;
7435 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007436
Victor Stinner3a50e702011-10-18 21:21:00 +02007437 if (code_page < 0) {
7438 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7439 return NULL;
7440 }
7441
Martin v. Löwis3d325192011-11-04 18:23:06 +01007442 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007443 return PyBytes_FromStringAndSize(NULL, 0);
7444
Victor Stinner7581cef2011-11-03 22:32:33 +01007445 offset = 0;
7446 do
7447 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007448#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007449 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007450 chunks. */
7451 if (len > INT_MAX/2) {
7452 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007453 done = 0;
7454 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007455 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007456#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007457 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007458 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007459 done = 1;
7460 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007461
Victor Stinner76a31a62011-11-04 00:05:13 +01007462 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007463 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007464 errors);
7465 if (ret == -2)
7466 ret = encode_code_page_errors(code_page, &outbytes,
7467 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007468 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007469 if (ret < 0) {
7470 Py_XDECREF(outbytes);
7471 return NULL;
7472 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007473
Victor Stinner7581cef2011-11-03 22:32:33 +01007474 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007475 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007476 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007477
Victor Stinner3a50e702011-10-18 21:21:00 +02007478 return outbytes;
7479}
7480
7481PyObject *
7482PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7483 Py_ssize_t size,
7484 const char *errors)
7485{
Victor Stinner7581cef2011-11-03 22:32:33 +01007486 PyObject *unicode, *res;
7487 unicode = PyUnicode_FromUnicode(p, size);
7488 if (unicode == NULL)
7489 return NULL;
7490 res = encode_code_page(CP_ACP, unicode, errors);
7491 Py_DECREF(unicode);
7492 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007493}
7494
7495PyObject *
7496PyUnicode_EncodeCodePage(int code_page,
7497 PyObject *unicode,
7498 const char *errors)
7499{
Victor Stinner7581cef2011-11-03 22:32:33 +01007500 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007501}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007502
Alexander Belopolsky40018472011-02-26 01:02:56 +00007503PyObject *
7504PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007505{
7506 if (!PyUnicode_Check(unicode)) {
7507 PyErr_BadArgument();
7508 return NULL;
7509 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007510 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007511}
7512
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007513#undef NEED_RETRY
7514
Victor Stinner99b95382011-07-04 14:23:54 +02007515#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007516
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517/* --- Character Mapping Codec -------------------------------------------- */
7518
Victor Stinnerfb161b12013-04-18 01:44:27 +02007519static int
7520charmap_decode_string(const char *s,
7521 Py_ssize_t size,
7522 PyObject *mapping,
7523 const char *errors,
7524 _PyUnicodeWriter *writer)
7525{
7526 const char *starts = s;
7527 const char *e;
7528 Py_ssize_t startinpos, endinpos;
7529 PyObject *errorHandler = NULL, *exc = NULL;
7530 Py_ssize_t maplen;
7531 enum PyUnicode_Kind mapkind;
7532 void *mapdata;
7533 Py_UCS4 x;
7534 unsigned char ch;
7535
7536 if (PyUnicode_READY(mapping) == -1)
7537 return -1;
7538
7539 maplen = PyUnicode_GET_LENGTH(mapping);
7540 mapdata = PyUnicode_DATA(mapping);
7541 mapkind = PyUnicode_KIND(mapping);
7542
7543 e = s + size;
7544
7545 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7546 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7547 * is disabled in encoding aliases, latin1 is preferred because
7548 * its implementation is faster. */
7549 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7550 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7551 Py_UCS4 maxchar = writer->maxchar;
7552
7553 assert (writer->kind == PyUnicode_1BYTE_KIND);
7554 while (s < e) {
7555 ch = *s;
7556 x = mapdata_ucs1[ch];
7557 if (x > maxchar) {
7558 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7559 goto onError;
7560 maxchar = writer->maxchar;
7561 outdata = (Py_UCS1 *)writer->data;
7562 }
7563 outdata[writer->pos] = x;
7564 writer->pos++;
7565 ++s;
7566 }
7567 return 0;
7568 }
7569
7570 while (s < e) {
7571 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7572 enum PyUnicode_Kind outkind = writer->kind;
7573 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7574 if (outkind == PyUnicode_1BYTE_KIND) {
7575 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7576 Py_UCS4 maxchar = writer->maxchar;
7577 while (s < e) {
7578 ch = *s;
7579 x = mapdata_ucs2[ch];
7580 if (x > maxchar)
7581 goto Error;
7582 outdata[writer->pos] = x;
7583 writer->pos++;
7584 ++s;
7585 }
7586 break;
7587 }
7588 else if (outkind == PyUnicode_2BYTE_KIND) {
7589 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7590 while (s < e) {
7591 ch = *s;
7592 x = mapdata_ucs2[ch];
7593 if (x == 0xFFFE)
7594 goto Error;
7595 outdata[writer->pos] = x;
7596 writer->pos++;
7597 ++s;
7598 }
7599 break;
7600 }
7601 }
7602 ch = *s;
7603
7604 if (ch < maplen)
7605 x = PyUnicode_READ(mapkind, mapdata, ch);
7606 else
7607 x = 0xfffe; /* invalid value */
7608Error:
7609 if (x == 0xfffe)
7610 {
7611 /* undefined mapping */
7612 startinpos = s-starts;
7613 endinpos = startinpos+1;
7614 if (unicode_decode_call_errorhandler_writer(
7615 errors, &errorHandler,
7616 "charmap", "character maps to <undefined>",
7617 &starts, &e, &startinpos, &endinpos, &exc, &s,
7618 writer)) {
7619 goto onError;
7620 }
7621 continue;
7622 }
7623
7624 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7625 goto onError;
7626 ++s;
7627 }
7628 Py_XDECREF(errorHandler);
7629 Py_XDECREF(exc);
7630 return 0;
7631
7632onError:
7633 Py_XDECREF(errorHandler);
7634 Py_XDECREF(exc);
7635 return -1;
7636}
7637
7638static int
7639charmap_decode_mapping(const char *s,
7640 Py_ssize_t size,
7641 PyObject *mapping,
7642 const char *errors,
7643 _PyUnicodeWriter *writer)
7644{
7645 const char *starts = s;
7646 const char *e;
7647 Py_ssize_t startinpos, endinpos;
7648 PyObject *errorHandler = NULL, *exc = NULL;
7649 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007650 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007651
7652 e = s + size;
7653
7654 while (s < e) {
7655 ch = *s;
7656
7657 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7658 key = PyLong_FromLong((long)ch);
7659 if (key == NULL)
7660 goto onError;
7661
7662 item = PyObject_GetItem(mapping, key);
7663 Py_DECREF(key);
7664 if (item == NULL) {
7665 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7666 /* No mapping found means: mapping is undefined. */
7667 PyErr_Clear();
7668 goto Undefined;
7669 } else
7670 goto onError;
7671 }
7672
7673 /* Apply mapping */
7674 if (item == Py_None)
7675 goto Undefined;
7676 if (PyLong_Check(item)) {
7677 long value = PyLong_AS_LONG(item);
7678 if (value == 0xFFFE)
7679 goto Undefined;
7680 if (value < 0 || value > MAX_UNICODE) {
7681 PyErr_Format(PyExc_TypeError,
7682 "character mapping must be in range(0x%lx)",
7683 (unsigned long)MAX_UNICODE + 1);
7684 goto onError;
7685 }
7686
7687 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7688 goto onError;
7689 }
7690 else if (PyUnicode_Check(item)) {
7691 if (PyUnicode_READY(item) == -1)
7692 goto onError;
7693 if (PyUnicode_GET_LENGTH(item) == 1) {
7694 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7695 if (value == 0xFFFE)
7696 goto Undefined;
7697 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7698 goto onError;
7699 }
7700 else {
7701 writer->overallocate = 1;
7702 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7703 goto onError;
7704 }
7705 }
7706 else {
7707 /* wrong return value */
7708 PyErr_SetString(PyExc_TypeError,
7709 "character mapping must return integer, None or str");
7710 goto onError;
7711 }
7712 Py_CLEAR(item);
7713 ++s;
7714 continue;
7715
7716Undefined:
7717 /* undefined mapping */
7718 Py_CLEAR(item);
7719 startinpos = s-starts;
7720 endinpos = startinpos+1;
7721 if (unicode_decode_call_errorhandler_writer(
7722 errors, &errorHandler,
7723 "charmap", "character maps to <undefined>",
7724 &starts, &e, &startinpos, &endinpos, &exc, &s,
7725 writer)) {
7726 goto onError;
7727 }
7728 }
7729 Py_XDECREF(errorHandler);
7730 Py_XDECREF(exc);
7731 return 0;
7732
7733onError:
7734 Py_XDECREF(item);
7735 Py_XDECREF(errorHandler);
7736 Py_XDECREF(exc);
7737 return -1;
7738}
7739
Alexander Belopolsky40018472011-02-26 01:02:56 +00007740PyObject *
7741PyUnicode_DecodeCharmap(const char *s,
7742 Py_ssize_t size,
7743 PyObject *mapping,
7744 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007746 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007747
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748 /* Default to Latin-1 */
7749 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007753 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007754 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007755 writer.min_length = size;
7756 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007758
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007759 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007760 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7761 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007762 }
7763 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007764 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7765 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007767 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007768
Benjamin Peterson29060642009-01-31 22:14:21 +00007769 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007770 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771 return NULL;
7772}
7773
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007774/* Charmap encoding: the lookup table */
7775
Alexander Belopolsky40018472011-02-26 01:02:56 +00007776struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007777 PyObject_HEAD
7778 unsigned char level1[32];
7779 int count2, count3;
7780 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007781};
7782
7783static PyObject*
7784encoding_map_size(PyObject *obj, PyObject* args)
7785{
7786 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007787 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007788 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007789}
7790
7791static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007792 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 PyDoc_STR("Return the size (in bytes) of this object") },
7794 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007795};
7796
7797static void
7798encoding_map_dealloc(PyObject* o)
7799{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007800 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007801}
7802
7803static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007804 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007805 "EncodingMap", /*tp_name*/
7806 sizeof(struct encoding_map), /*tp_basicsize*/
7807 0, /*tp_itemsize*/
7808 /* methods */
7809 encoding_map_dealloc, /*tp_dealloc*/
7810 0, /*tp_print*/
7811 0, /*tp_getattr*/
7812 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007813 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007814 0, /*tp_repr*/
7815 0, /*tp_as_number*/
7816 0, /*tp_as_sequence*/
7817 0, /*tp_as_mapping*/
7818 0, /*tp_hash*/
7819 0, /*tp_call*/
7820 0, /*tp_str*/
7821 0, /*tp_getattro*/
7822 0, /*tp_setattro*/
7823 0, /*tp_as_buffer*/
7824 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7825 0, /*tp_doc*/
7826 0, /*tp_traverse*/
7827 0, /*tp_clear*/
7828 0, /*tp_richcompare*/
7829 0, /*tp_weaklistoffset*/
7830 0, /*tp_iter*/
7831 0, /*tp_iternext*/
7832 encoding_map_methods, /*tp_methods*/
7833 0, /*tp_members*/
7834 0, /*tp_getset*/
7835 0, /*tp_base*/
7836 0, /*tp_dict*/
7837 0, /*tp_descr_get*/
7838 0, /*tp_descr_set*/
7839 0, /*tp_dictoffset*/
7840 0, /*tp_init*/
7841 0, /*tp_alloc*/
7842 0, /*tp_new*/
7843 0, /*tp_free*/
7844 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007845};
7846
7847PyObject*
7848PyUnicode_BuildEncodingMap(PyObject* string)
7849{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007850 PyObject *result;
7851 struct encoding_map *mresult;
7852 int i;
7853 int need_dict = 0;
7854 unsigned char level1[32];
7855 unsigned char level2[512];
7856 unsigned char *mlevel1, *mlevel2, *mlevel3;
7857 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007858 int kind;
7859 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007860 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007861 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007862
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007863 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007864 PyErr_BadArgument();
7865 return NULL;
7866 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007867 kind = PyUnicode_KIND(string);
7868 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007869 length = PyUnicode_GET_LENGTH(string);
7870 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007871 memset(level1, 0xFF, sizeof level1);
7872 memset(level2, 0xFF, sizeof level2);
7873
7874 /* If there isn't a one-to-one mapping of NULL to \0,
7875 or if there are non-BMP characters, we need to use
7876 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007877 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007878 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007879 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007880 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007881 ch = PyUnicode_READ(kind, data, i);
7882 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007883 need_dict = 1;
7884 break;
7885 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007886 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007887 /* unmapped character */
7888 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007889 l1 = ch >> 11;
7890 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007891 if (level1[l1] == 0xFF)
7892 level1[l1] = count2++;
7893 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007894 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007895 }
7896
7897 if (count2 >= 0xFF || count3 >= 0xFF)
7898 need_dict = 1;
7899
7900 if (need_dict) {
7901 PyObject *result = PyDict_New();
7902 PyObject *key, *value;
7903 if (!result)
7904 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007905 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007906 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007907 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007908 if (!key || !value)
7909 goto failed1;
7910 if (PyDict_SetItem(result, key, value) == -1)
7911 goto failed1;
7912 Py_DECREF(key);
7913 Py_DECREF(value);
7914 }
7915 return result;
7916 failed1:
7917 Py_XDECREF(key);
7918 Py_XDECREF(value);
7919 Py_DECREF(result);
7920 return NULL;
7921 }
7922
7923 /* Create a three-level trie */
7924 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7925 16*count2 + 128*count3 - 1);
7926 if (!result)
7927 return PyErr_NoMemory();
7928 PyObject_Init(result, &EncodingMapType);
7929 mresult = (struct encoding_map*)result;
7930 mresult->count2 = count2;
7931 mresult->count3 = count3;
7932 mlevel1 = mresult->level1;
7933 mlevel2 = mresult->level23;
7934 mlevel3 = mresult->level23 + 16*count2;
7935 memcpy(mlevel1, level1, 32);
7936 memset(mlevel2, 0xFF, 16*count2);
7937 memset(mlevel3, 0, 128*count3);
7938 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007939 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007940 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007941 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7942 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007943 /* unmapped character */
7944 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007945 o1 = ch>>11;
7946 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007947 i2 = 16*mlevel1[o1] + o2;
7948 if (mlevel2[i2] == 0xFF)
7949 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007950 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007951 i3 = 128*mlevel2[i2] + o3;
7952 mlevel3[i3] = i;
7953 }
7954 return result;
7955}
7956
7957static int
Victor Stinner22168992011-11-20 17:09:18 +01007958encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007959{
7960 struct encoding_map *map = (struct encoding_map*)mapping;
7961 int l1 = c>>11;
7962 int l2 = (c>>7) & 0xF;
7963 int l3 = c & 0x7F;
7964 int i;
7965
Victor Stinner22168992011-11-20 17:09:18 +01007966 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007967 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007968 if (c == 0)
7969 return 0;
7970 /* level 1*/
7971 i = map->level1[l1];
7972 if (i == 0xFF) {
7973 return -1;
7974 }
7975 /* level 2*/
7976 i = map->level23[16*i+l2];
7977 if (i == 0xFF) {
7978 return -1;
7979 }
7980 /* level 3 */
7981 i = map->level23[16*map->count2 + 128*i + l3];
7982 if (i == 0) {
7983 return -1;
7984 }
7985 return i;
7986}
7987
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007988/* Lookup the character ch in the mapping. If the character
7989 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007990 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007991static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007992charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993{
Christian Heimes217cfd12007-12-02 14:31:20 +00007994 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007995 PyObject *x;
7996
7997 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007998 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007999 x = PyObject_GetItem(mapping, w);
8000 Py_DECREF(w);
8001 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008002 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8003 /* No mapping found means: mapping is undefined. */
8004 PyErr_Clear();
8005 x = Py_None;
8006 Py_INCREF(x);
8007 return x;
8008 } else
8009 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008011 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008012 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008013 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 long value = PyLong_AS_LONG(x);
8015 if (value < 0 || value > 255) {
8016 PyErr_SetString(PyExc_TypeError,
8017 "character mapping must be in range(256)");
8018 Py_DECREF(x);
8019 return NULL;
8020 }
8021 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008023 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 /* wrong return value */
8027 PyErr_Format(PyExc_TypeError,
8028 "character mapping must return integer, bytes or None, not %.400s",
8029 x->ob_type->tp_name);
8030 Py_DECREF(x);
8031 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032 }
8033}
8034
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008035static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008036charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008037{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008038 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8039 /* exponentially overallocate to minimize reallocations */
8040 if (requiredsize < 2*outsize)
8041 requiredsize = 2*outsize;
8042 if (_PyBytes_Resize(outobj, requiredsize))
8043 return -1;
8044 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008045}
8046
/* Outcome of one attempt to encode a character through the mapping. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement byte(s) were written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a lookup or a buffer resize raised an exception */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008050/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008051 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008052 space is available. Return a new reference to the object that
8053 was put in the output buffer, or Py_None, if the mapping was undefined
8054 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008055 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008056static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008057charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008058 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008059{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008060 PyObject *rep;
8061 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008062 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008063
Christian Heimes90aa7642007-12-19 02:45:37 +00008064 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008065 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008067 if (res == -1)
8068 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 if (outsize<requiredsize)
8070 if (charmapencode_resize(outobj, outpos, requiredsize))
8071 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008072 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008073 outstart[(*outpos)++] = (char)res;
8074 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008075 }
8076
8077 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008078 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008079 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008080 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 Py_DECREF(rep);
8082 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008083 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008084 if (PyLong_Check(rep)) {
8085 Py_ssize_t requiredsize = *outpos+1;
8086 if (outsize<requiredsize)
8087 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8088 Py_DECREF(rep);
8089 return enc_EXCEPTION;
8090 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008091 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008092 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008093 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 else {
8095 const char *repchars = PyBytes_AS_STRING(rep);
8096 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8097 Py_ssize_t requiredsize = *outpos+repsize;
8098 if (outsize<requiredsize)
8099 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8100 Py_DECREF(rep);
8101 return enc_EXCEPTION;
8102 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008103 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 memcpy(outstart + *outpos, repchars, repsize);
8105 *outpos += repsize;
8106 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008107 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008108 Py_DECREF(rep);
8109 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008110}
8111
8112/* handle an error in PyUnicode_EncodeCharmap
8113 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008114static int
8115charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008116 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008117 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008118 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008119 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008120{
8121 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008122 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008123 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008124 enum PyUnicode_Kind kind;
8125 void *data;
8126 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008127 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008128 Py_ssize_t collstartpos = *inpos;
8129 Py_ssize_t collendpos = *inpos+1;
8130 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008131 char *encoding = "charmap";
8132 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008133 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008134 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008135 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008136
Benjamin Petersonbac79492012-01-14 13:34:47 -05008137 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008138 return -1;
8139 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140 /* find all unencodable characters */
8141 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008142 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008143 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008144 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008145 val = encoding_map_lookup(ch, mapping);
8146 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 break;
8148 ++collendpos;
8149 continue;
8150 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008151
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008152 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8153 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008154 if (rep==NULL)
8155 return -1;
8156 else if (rep!=Py_None) {
8157 Py_DECREF(rep);
8158 break;
8159 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008160 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008161 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008162 }
8163 /* cache callback name lookup
8164 * (if not done yet, i.e. it's the first error) */
8165 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 if ((errors==NULL) || (!strcmp(errors, "strict")))
8167 *known_errorHandler = 1;
8168 else if (!strcmp(errors, "replace"))
8169 *known_errorHandler = 2;
8170 else if (!strcmp(errors, "ignore"))
8171 *known_errorHandler = 3;
8172 else if (!strcmp(errors, "xmlcharrefreplace"))
8173 *known_errorHandler = 4;
8174 else
8175 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008176 }
8177 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008178 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008179 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008180 return -1;
8181 case 2: /* replace */
8182 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 x = charmapencode_output('?', mapping, res, respos);
8184 if (x==enc_EXCEPTION) {
8185 return -1;
8186 }
8187 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008188 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008189 return -1;
8190 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008191 }
8192 /* fall through */
8193 case 3: /* ignore */
8194 *inpos = collendpos;
8195 break;
8196 case 4: /* xmlcharrefreplace */
8197 /* generate replacement (temporarily (mis)uses p) */
8198 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008199 char buffer[2+29+1+1];
8200 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008201 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008202 for (cp = buffer; *cp; ++cp) {
8203 x = charmapencode_output(*cp, mapping, res, respos);
8204 if (x==enc_EXCEPTION)
8205 return -1;
8206 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008207 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008208 return -1;
8209 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008210 }
8211 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008212 *inpos = collendpos;
8213 break;
8214 default:
8215 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008216 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008217 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008218 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008220 if (PyBytes_Check(repunicode)) {
8221 /* Directly copy bytes result to output. */
8222 Py_ssize_t outsize = PyBytes_Size(*res);
8223 Py_ssize_t requiredsize;
8224 repsize = PyBytes_Size(repunicode);
8225 requiredsize = *respos + repsize;
8226 if (requiredsize > outsize)
8227 /* Make room for all additional bytes. */
8228 if (charmapencode_resize(res, respos, requiredsize)) {
8229 Py_DECREF(repunicode);
8230 return -1;
8231 }
8232 memcpy(PyBytes_AsString(*res) + *respos,
8233 PyBytes_AsString(repunicode), repsize);
8234 *respos += repsize;
8235 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008236 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008237 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008238 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008239 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008240 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008241 Py_DECREF(repunicode);
8242 return -1;
8243 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008244 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008245 data = PyUnicode_DATA(repunicode);
8246 kind = PyUnicode_KIND(repunicode);
8247 for (index = 0; index < repsize; index++) {
8248 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8249 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008251 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008252 return -1;
8253 }
8254 else if (x==enc_FAILED) {
8255 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008256 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 return -1;
8258 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008259 }
8260 *inpos = newpos;
8261 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262 }
8263 return 0;
8264}
8265
Alexander Belopolsky40018472011-02-26 01:02:56 +00008266PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008267_PyUnicode_EncodeCharmap(PyObject *unicode,
8268 PyObject *mapping,
8269 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008271 /* output object */
8272 PyObject *res = NULL;
8273 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008274 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008275 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008276 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008277 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008278 PyObject *errorHandler = NULL;
8279 PyObject *exc = NULL;
8280 /* the following variable is used for caching string comparisons
8281 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8282 * 3=ignore, 4=xmlcharrefreplace */
8283 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008284 void *data;
8285 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286
Benjamin Petersonbac79492012-01-14 13:34:47 -05008287 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008288 return NULL;
8289 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008290 data = PyUnicode_DATA(unicode);
8291 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008292
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 /* Default to Latin-1 */
8294 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008295 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008297 /* allocate enough for a simple encoding without
8298 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008299 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008300 if (res == NULL)
8301 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008302 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008305 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008306 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008308 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 if (x==enc_EXCEPTION) /* error */
8310 goto onError;
8311 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008312 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 &exc,
8314 &known_errorHandler, &errorHandler, errors,
8315 &res, &respos)) {
8316 goto onError;
8317 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008318 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 else
8320 /* done with this character => adjust input position */
8321 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008324 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008325 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008326 if (_PyBytes_Resize(&res, respos) < 0)
8327 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008328
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008329 Py_XDECREF(exc);
8330 Py_XDECREF(errorHandler);
8331 return res;
8332
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008334 Py_XDECREF(res);
8335 Py_XDECREF(exc);
8336 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337 return NULL;
8338}
8339
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008340/* Deprecated */
8341PyObject *
8342PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8343 Py_ssize_t size,
8344 PyObject *mapping,
8345 const char *errors)
8346{
8347 PyObject *result;
8348 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8349 if (unicode == NULL)
8350 return NULL;
8351 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8352 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008353 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008354}
8355
Alexander Belopolsky40018472011-02-26 01:02:56 +00008356PyObject *
8357PyUnicode_AsCharmapString(PyObject *unicode,
8358 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359{
8360 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 PyErr_BadArgument();
8362 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008364 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365}
8366
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008368static void
8369make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008370 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008371 Py_ssize_t startpos, Py_ssize_t endpos,
8372 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008375 *exceptionObject = _PyUnicodeTranslateError_Create(
8376 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008377 }
8378 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8380 goto onError;
8381 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8382 goto onError;
8383 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8384 goto onError;
8385 return;
8386 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008387 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008388 }
8389}
8390
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008391/* error handling callback helper:
8392 build arguments, call the callback and check the arguments,
8393 put the result into newpos and return the replacement string, which
8394 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008395static PyObject *
8396unicode_translate_call_errorhandler(const char *errors,
8397 PyObject **errorHandler,
8398 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008399 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008400 Py_ssize_t startpos, Py_ssize_t endpos,
8401 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008402{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008403 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008405 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008406 PyObject *restuple;
8407 PyObject *resunicode;
8408
8409 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413 }
8414
8415 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008416 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008417 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008419
8420 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008424 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008425 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 Py_DECREF(restuple);
8427 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008428 }
8429 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 &resunicode, &i_newpos)) {
8431 Py_DECREF(restuple);
8432 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008433 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008434 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008435 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008436 else
8437 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008438 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8440 Py_DECREF(restuple);
8441 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008442 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008443 Py_INCREF(resunicode);
8444 Py_DECREF(restuple);
8445 return resunicode;
8446}
8447
8448/* Lookup the character ch in the mapping and put the result in result,
8449 which must be decrefed by the caller.
8450 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008451static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008452charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008453{
Christian Heimes217cfd12007-12-02 14:31:20 +00008454 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008455 PyObject *x;
8456
8457 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008459 x = PyObject_GetItem(mapping, w);
8460 Py_DECREF(w);
8461 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8463 /* No mapping found means: use 1:1 mapping. */
8464 PyErr_Clear();
8465 *result = NULL;
8466 return 0;
8467 } else
8468 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008469 }
8470 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 *result = x;
8472 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008473 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008474 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008476 if (value < 0 || value > MAX_UNICODE) {
8477 PyErr_Format(PyExc_ValueError,
8478 "character mapping must be in range(0x%x)",
8479 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 Py_DECREF(x);
8481 return -1;
8482 }
8483 *result = x;
8484 return 0;
8485 }
8486 else if (PyUnicode_Check(x)) {
8487 *result = x;
8488 return 0;
8489 }
8490 else {
8491 /* wrong return value */
8492 PyErr_SetString(PyExc_TypeError,
8493 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008494 Py_DECREF(x);
8495 return -1;
8496 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008497}
Victor Stinner1194ea02014-04-04 19:37:40 +02008498
8499/* lookup the character, write the result into the writer.
8500 Return 1 if the result was written into the writer, return 0 if the mapping
8501 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008502static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008503charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8504 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008505{
Victor Stinner1194ea02014-04-04 19:37:40 +02008506 PyObject *item;
8507
8508 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008510
8511 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008513 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008514 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008516 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008517 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008518
8519 if (item == Py_None) {
8520 Py_DECREF(item);
8521 return 0;
8522 }
8523
8524 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008525 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8526 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8527 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008528 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8529 Py_DECREF(item);
8530 return -1;
8531 }
8532 Py_DECREF(item);
8533 return 1;
8534 }
8535
8536 if (!PyUnicode_Check(item)) {
8537 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008538 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008539 }
8540
8541 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8542 Py_DECREF(item);
8543 return -1;
8544 }
8545
8546 Py_DECREF(item);
8547 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008548}
8549
Victor Stinner89a76ab2014-04-05 11:44:04 +02008550static int
8551unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8552 Py_UCS1 *translate)
8553{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008554 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008555 int ret = 0;
8556
Victor Stinner89a76ab2014-04-05 11:44:04 +02008557 if (charmaptranslate_lookup(ch, mapping, &item)) {
8558 return -1;
8559 }
8560
8561 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008562 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008563 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008564 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008565 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008566 /* not found => default to 1:1 mapping */
8567 translate[ch] = ch;
8568 return 1;
8569 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008570 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008571 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008572 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8573 used it */
8574 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008575 /* invalid character or character outside ASCII:
8576 skip the fast translate */
8577 goto exit;
8578 }
8579 translate[ch] = (Py_UCS1)replace;
8580 }
8581 else if (PyUnicode_Check(item)) {
8582 Py_UCS4 replace;
8583
8584 if (PyUnicode_READY(item) == -1) {
8585 Py_DECREF(item);
8586 return -1;
8587 }
8588 if (PyUnicode_GET_LENGTH(item) != 1)
8589 goto exit;
8590
8591 replace = PyUnicode_READ_CHAR(item, 0);
8592 if (replace > 127)
8593 goto exit;
8594 translate[ch] = (Py_UCS1)replace;
8595 }
8596 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008597 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008598 goto exit;
8599 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008600 ret = 1;
8601
Benjamin Peterson1365de72014-04-07 20:15:41 -04008602 exit:
8603 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008604 return ret;
8605}
8606
8607/* Fast path for ascii => ascii translation. Return 1 if the whole string
8608 was translated into writer, return 0 if the input string was partially
8609 translated into writer, raise an exception and return -1 on error. */
8610static int
8611unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner872b2912014-04-05 14:27:07 +02008612 _PyUnicodeWriter *writer, int ignore)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008613{
Victor Stinner872b2912014-04-05 14:27:07 +02008614 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008615 Py_ssize_t len;
8616 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008617 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008618
8619 if (PyUnicode_READY(input) == -1)
8620 return -1;
8621 if (!PyUnicode_IS_ASCII(input))
8622 return 0;
8623 len = PyUnicode_GET_LENGTH(input);
8624
Victor Stinner872b2912014-04-05 14:27:07 +02008625 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008626
8627 in = PyUnicode_1BYTE_DATA(input);
8628 end = in + len;
8629
8630 assert(PyUnicode_IS_ASCII(writer->buffer));
8631 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8632 out = PyUnicode_1BYTE_DATA(writer->buffer);
8633
Victor Stinner872b2912014-04-05 14:27:07 +02008634 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008635 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008636 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008637 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008638 int translate = unicode_fast_translate_lookup(mapping, ch,
8639 ascii_table);
8640 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008641 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008642 if (translate == 0)
8643 goto exit;
8644 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008645 }
Victor Stinner872b2912014-04-05 14:27:07 +02008646 if (ch2 == 0xfe) {
8647 if (ignore)
8648 continue;
8649 goto exit;
8650 }
8651 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008652 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008653 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008654 }
Victor Stinner872b2912014-04-05 14:27:07 +02008655 res = 1;
8656
8657exit:
8658 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8659 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008660}
8661
Alexander Belopolsky40018472011-02-26 01:02:56 +00008662PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663_PyUnicode_TranslateCharmap(PyObject *input,
8664 PyObject *mapping,
8665 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008667 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008668 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008669 Py_ssize_t size, i;
8670 int kind;
8671 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008672 _PyUnicodeWriter writer;
8673 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674 char *reason = "character maps to <undefined>";
8675 PyObject *errorHandler = NULL;
8676 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008677 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008678 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008679
Guido van Rossumd57fd912000-03-10 22:53:23 +00008680 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 PyErr_BadArgument();
8682 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008685 if (PyUnicode_READY(input) == -1)
8686 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008687 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008688 kind = PyUnicode_KIND(input);
8689 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008690
8691 if (size == 0) {
8692 Py_INCREF(input);
8693 return input;
8694 }
8695
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008696 /* allocate enough for a simple 1:1 translation without
8697 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008698 _PyUnicodeWriter_Init(&writer);
8699 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008700 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008701
Victor Stinner872b2912014-04-05 14:27:07 +02008702 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8703
8704 res = unicode_fast_translate(input, mapping, &writer, ignore);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008705 if (res < 0) {
8706 _PyUnicodeWriter_Dealloc(&writer);
8707 return NULL;
8708 }
8709 if (res == 1)
8710 return _PyUnicodeWriter_Finish(&writer);
8711
Victor Stinner89a76ab2014-04-05 11:44:04 +02008712 i = writer.pos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008713 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008715 int translate;
8716 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8717 Py_ssize_t newpos;
8718 /* startpos for collecting untranslatable chars */
8719 Py_ssize_t collstart;
8720 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008721 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722
Victor Stinner1194ea02014-04-04 19:37:40 +02008723 ch = PyUnicode_READ(kind, data, i);
8724 translate = charmaptranslate_output(ch, mapping, &writer);
8725 if (translate < 0)
8726 goto onError;
8727
8728 if (translate != 0) {
8729 /* it worked => adjust input pointer */
8730 ++i;
8731 continue;
8732 }
8733
8734 /* untranslatable character */
8735 collstart = i;
8736 collend = i+1;
8737
8738 /* find all untranslatable characters */
8739 while (collend < size) {
8740 PyObject *x;
8741 ch = PyUnicode_READ(kind, data, collend);
8742 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008743 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008744 Py_XDECREF(x);
8745 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008746 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008747 ++collend;
8748 }
8749
8750 if (ignore) {
8751 i = collend;
8752 }
8753 else {
8754 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8755 reason, input, &exc,
8756 collstart, collend, &newpos);
8757 if (repunicode == NULL)
8758 goto onError;
8759 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008760 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008761 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008762 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008763 Py_DECREF(repunicode);
8764 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008765 }
8766 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008767 Py_XDECREF(exc);
8768 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008769 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008770
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008772 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008773 Py_XDECREF(exc);
8774 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008775 return NULL;
8776}
8777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008778/* Deprecated. Use PyUnicode_Translate instead. */
8779PyObject *
8780PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8781 Py_ssize_t size,
8782 PyObject *mapping,
8783 const char *errors)
8784{
Christian Heimes5f520f42012-09-11 14:03:25 +02008785 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008786 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8787 if (!unicode)
8788 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008789 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8790 Py_DECREF(unicode);
8791 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008792}
8793
Alexander Belopolsky40018472011-02-26 01:02:56 +00008794PyObject *
8795PyUnicode_Translate(PyObject *str,
8796 PyObject *mapping,
8797 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798{
8799 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008800
Guido van Rossumd57fd912000-03-10 22:53:23 +00008801 str = PyUnicode_FromObject(str);
8802 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008803 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008805 Py_DECREF(str);
8806 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807}
Tim Petersced69f82003-09-16 20:30:58 +00008808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008809static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008810fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008811{
8812 /* No need to call PyUnicode_READY(self) because this function is only
8813 called as a callback from fixup() which does it already. */
8814 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8815 const int kind = PyUnicode_KIND(self);
8816 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008817 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008818 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008819 Py_ssize_t i;
8820
8821 for (i = 0; i < len; ++i) {
8822 ch = PyUnicode_READ(kind, data, i);
8823 fixed = 0;
8824 if (ch > 127) {
8825 if (Py_UNICODE_ISSPACE(ch))
8826 fixed = ' ';
8827 else {
8828 const int decimal = Py_UNICODE_TODECIMAL(ch);
8829 if (decimal >= 0)
8830 fixed = '0' + decimal;
8831 }
8832 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008833 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008834 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008835 PyUnicode_WRITE(kind, data, i, fixed);
8836 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008837 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008838 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008840 }
8841
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008842 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008843}
8844
8845PyObject *
8846_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8847{
8848 if (!PyUnicode_Check(unicode)) {
8849 PyErr_BadInternalCall();
8850 return NULL;
8851 }
8852 if (PyUnicode_READY(unicode) == -1)
8853 return NULL;
8854 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8855 /* If the string is already ASCII, just return the same string */
8856 Py_INCREF(unicode);
8857 return unicode;
8858 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008859 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008860}
8861
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008862PyObject *
8863PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8864 Py_ssize_t length)
8865{
Victor Stinnerf0124502011-11-21 23:12:56 +01008866 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008867 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008868 Py_UCS4 maxchar;
8869 enum PyUnicode_Kind kind;
8870 void *data;
8871
Victor Stinner99d7ad02012-02-22 13:37:39 +01008872 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008873 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008874 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008875 if (ch > 127) {
8876 int decimal = Py_UNICODE_TODECIMAL(ch);
8877 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008878 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008879 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008880 }
8881 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008882
8883 /* Copy to a new string */
8884 decimal = PyUnicode_New(length, maxchar);
8885 if (decimal == NULL)
8886 return decimal;
8887 kind = PyUnicode_KIND(decimal);
8888 data = PyUnicode_DATA(decimal);
8889 /* Iterate over code points */
8890 for (i = 0; i < length; i++) {
8891 Py_UNICODE ch = s[i];
8892 if (ch > 127) {
8893 int decimal = Py_UNICODE_TODECIMAL(ch);
8894 if (decimal >= 0)
8895 ch = '0' + decimal;
8896 }
8897 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008899 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008900}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008901/* --- Decimal Encoder ---------------------------------------------------- */
8902
Alexander Belopolsky40018472011-02-26 01:02:56 +00008903int
8904PyUnicode_EncodeDecimal(Py_UNICODE *s,
8905 Py_ssize_t length,
8906 char *output,
8907 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008908{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008909 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008910 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008911 enum PyUnicode_Kind kind;
8912 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008913
8914 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008915 PyErr_BadArgument();
8916 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008917 }
8918
Victor Stinner42bf7752011-11-21 22:52:58 +01008919 unicode = PyUnicode_FromUnicode(s, length);
8920 if (unicode == NULL)
8921 return -1;
8922
Benjamin Petersonbac79492012-01-14 13:34:47 -05008923 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008924 Py_DECREF(unicode);
8925 return -1;
8926 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008927 kind = PyUnicode_KIND(unicode);
8928 data = PyUnicode_DATA(unicode);
8929
Victor Stinnerb84d7232011-11-22 01:50:07 +01008930 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008931 PyObject *exc;
8932 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008933 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008934 Py_ssize_t startpos;
8935
8936 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008937
Benjamin Peterson29060642009-01-31 22:14:21 +00008938 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008939 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008940 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008941 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008942 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008943 decimal = Py_UNICODE_TODECIMAL(ch);
8944 if (decimal >= 0) {
8945 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008946 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008947 continue;
8948 }
8949 if (0 < ch && ch < 256) {
8950 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008951 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008952 continue;
8953 }
Victor Stinner6345be92011-11-25 20:09:01 +01008954
Victor Stinner42bf7752011-11-21 22:52:58 +01008955 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008956 exc = NULL;
8957 raise_encode_exception(&exc, "decimal", unicode,
8958 startpos, startpos+1,
8959 "invalid decimal Unicode string");
8960 Py_XDECREF(exc);
8961 Py_DECREF(unicode);
8962 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008963 }
8964 /* 0-terminate the output string */
8965 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008966 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008967 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008968}
8969
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970/* --- Helpers ------------------------------------------------------------ */
8971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008972static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008973any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974 Py_ssize_t start,
8975 Py_ssize_t end)
8976{
8977 int kind1, kind2, kind;
8978 void *buf1, *buf2;
8979 Py_ssize_t len1, len2, result;
8980
8981 kind1 = PyUnicode_KIND(s1);
8982 kind2 = PyUnicode_KIND(s2);
8983 kind = kind1 > kind2 ? kind1 : kind2;
8984 buf1 = PyUnicode_DATA(s1);
8985 buf2 = PyUnicode_DATA(s2);
8986 if (kind1 != kind)
8987 buf1 = _PyUnicode_AsKind(s1, kind);
8988 if (!buf1)
8989 return -2;
8990 if (kind2 != kind)
8991 buf2 = _PyUnicode_AsKind(s2, kind);
8992 if (!buf2) {
8993 if (kind1 != kind) PyMem_Free(buf1);
8994 return -2;
8995 }
8996 len1 = PyUnicode_GET_LENGTH(s1);
8997 len2 = PyUnicode_GET_LENGTH(s2);
8998
Victor Stinner794d5672011-10-10 03:21:36 +02008999 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009000 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009001 case PyUnicode_1BYTE_KIND:
9002 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9003 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9004 else
9005 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9006 break;
9007 case PyUnicode_2BYTE_KIND:
9008 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9009 break;
9010 case PyUnicode_4BYTE_KIND:
9011 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9012 break;
9013 default:
9014 assert(0); result = -2;
9015 }
9016 }
9017 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009018 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009019 case PyUnicode_1BYTE_KIND:
9020 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9021 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9022 else
9023 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9024 break;
9025 case PyUnicode_2BYTE_KIND:
9026 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9027 break;
9028 case PyUnicode_4BYTE_KIND:
9029 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9030 break;
9031 default:
9032 assert(0); result = -2;
9033 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009034 }
9035
9036 if (kind1 != kind)
9037 PyMem_Free(buf1);
9038 if (kind2 != kind)
9039 PyMem_Free(buf2);
9040
9041 return result;
9042}
9043
9044Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009045_PyUnicode_InsertThousandsGrouping(
9046 PyObject *unicode, Py_ssize_t index,
9047 Py_ssize_t n_buffer,
9048 void *digits, Py_ssize_t n_digits,
9049 Py_ssize_t min_width,
9050 const char *grouping, PyObject *thousands_sep,
9051 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052{
Victor Stinner41a863c2012-02-24 00:37:51 +01009053 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009054 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009055 Py_ssize_t thousands_sep_len;
9056 Py_ssize_t len;
9057
9058 if (unicode != NULL) {
9059 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009060 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009061 }
9062 else {
9063 kind = PyUnicode_1BYTE_KIND;
9064 data = NULL;
9065 }
9066 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9067 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9068 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9069 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009070 if (thousands_sep_kind < kind) {
9071 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9072 if (!thousands_sep_data)
9073 return -1;
9074 }
9075 else {
9076 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9077 if (!data)
9078 return -1;
9079 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009080 }
9081
Benjamin Petersonead6b532011-12-20 17:23:42 -06009082 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009083 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009084 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009085 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009086 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009087 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009088 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009089 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009090 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009091 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009092 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009093 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009094 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009096 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009097 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009098 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009099 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009100 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009101 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009102 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009103 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009104 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009105 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009106 break;
9107 default:
9108 assert(0);
9109 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009110 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009111 if (unicode != NULL && thousands_sep_kind != kind) {
9112 if (thousands_sep_kind < kind)
9113 PyMem_Free(thousands_sep_data);
9114 else
9115 PyMem_Free(data);
9116 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009117 if (unicode == NULL) {
9118 *maxchar = 127;
9119 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009120 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009121 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009122 }
9123 }
9124 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009125}
9126
9127
/* helper macro to fixup start/end slice values: `end` is clipped to `len`,
   negative `start`/`end` are taken relative to `len` and then clipped to 0.
   `start` and `end` must be modifiable lvalues; every argument is evaluated
   more than once, so pass expressions without side effects.
   The body is wrapped in do { } while (0) so that the macro behaves like a
   single statement (it needs a trailing semicolon and is safe inside an
   unbraced if/else). */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009142
Alexander Belopolsky40018472011-02-26 01:02:56 +00009143Py_ssize_t
9144PyUnicode_Count(PyObject *str,
9145 PyObject *substr,
9146 Py_ssize_t start,
9147 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009148{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009149 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009150 PyObject* str_obj;
9151 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009152 int kind1, kind2, kind;
9153 void *buf1 = NULL, *buf2 = NULL;
9154 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009155
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009156 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009157 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009158 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009159 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009160 if (!sub_obj) {
9161 Py_DECREF(str_obj);
9162 return -1;
9163 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009164 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009165 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009166 Py_DECREF(str_obj);
9167 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009168 }
Tim Petersced69f82003-09-16 20:30:58 +00009169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009170 kind1 = PyUnicode_KIND(str_obj);
9171 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009172 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009173 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009174 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009175 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02009176 if (kind2 > kind) {
9177 Py_DECREF(sub_obj);
9178 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009179 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02009180 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01009181 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009182 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009183 if (!buf2)
9184 goto onError;
9185 len1 = PyUnicode_GET_LENGTH(str_obj);
9186 len2 = PyUnicode_GET_LENGTH(sub_obj);
9187
9188 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009189 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009190 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009191 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9192 result = asciilib_count(
9193 ((Py_UCS1*)buf1) + start, end - start,
9194 buf2, len2, PY_SSIZE_T_MAX
9195 );
9196 else
9197 result = ucs1lib_count(
9198 ((Py_UCS1*)buf1) + start, end - start,
9199 buf2, len2, PY_SSIZE_T_MAX
9200 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009201 break;
9202 case PyUnicode_2BYTE_KIND:
9203 result = ucs2lib_count(
9204 ((Py_UCS2*)buf1) + start, end - start,
9205 buf2, len2, PY_SSIZE_T_MAX
9206 );
9207 break;
9208 case PyUnicode_4BYTE_KIND:
9209 result = ucs4lib_count(
9210 ((Py_UCS4*)buf1) + start, end - start,
9211 buf2, len2, PY_SSIZE_T_MAX
9212 );
9213 break;
9214 default:
9215 assert(0); result = 0;
9216 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009217
9218 Py_DECREF(sub_obj);
9219 Py_DECREF(str_obj);
9220
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009221 if (kind2 != kind)
9222 PyMem_Free(buf2);
9223
Guido van Rossumd57fd912000-03-10 22:53:23 +00009224 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009225 onError:
9226 Py_DECREF(sub_obj);
9227 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009228 if (kind2 != kind && buf2)
9229 PyMem_Free(buf2);
9230 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009231}
9232
Alexander Belopolsky40018472011-02-26 01:02:56 +00009233Py_ssize_t
9234PyUnicode_Find(PyObject *str,
9235 PyObject *sub,
9236 Py_ssize_t start,
9237 Py_ssize_t end,
9238 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009239{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009240 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009241
Guido van Rossumd57fd912000-03-10 22:53:23 +00009242 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009243 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009244 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009245 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009246 if (!sub) {
9247 Py_DECREF(str);
9248 return -2;
9249 }
9250 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9251 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009252 Py_DECREF(str);
9253 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009254 }
Tim Petersced69f82003-09-16 20:30:58 +00009255
Victor Stinner794d5672011-10-10 03:21:36 +02009256 result = any_find_slice(direction,
9257 str, sub, start, end
9258 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009259
Guido van Rossumd57fd912000-03-10 22:53:23 +00009260 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009261 Py_DECREF(sub);
9262
Guido van Rossumd57fd912000-03-10 22:53:23 +00009263 return result;
9264}
9265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266Py_ssize_t
9267PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9268 Py_ssize_t start, Py_ssize_t end,
9269 int direction)
9270{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009272 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 if (PyUnicode_READY(str) == -1)
9274 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009275 if (start < 0 || end < 0) {
9276 PyErr_SetString(PyExc_IndexError, "string index out of range");
9277 return -2;
9278 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009279 if (end > PyUnicode_GET_LENGTH(str))
9280 end = PyUnicode_GET_LENGTH(str);
9281 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009282 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9283 kind, end-start, ch, direction);
9284 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009286 else
9287 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009288}
9289
Alexander Belopolsky40018472011-02-26 01:02:56 +00009290static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009291tailmatch(PyObject *self,
9292 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009293 Py_ssize_t start,
9294 Py_ssize_t end,
9295 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009296{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009297 int kind_self;
9298 int kind_sub;
9299 void *data_self;
9300 void *data_sub;
9301 Py_ssize_t offset;
9302 Py_ssize_t i;
9303 Py_ssize_t end_sub;
9304
9305 if (PyUnicode_READY(self) == -1 ||
9306 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009307 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009308
9309 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009310 return 1;
9311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9313 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009314 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009315 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317 kind_self = PyUnicode_KIND(self);
9318 data_self = PyUnicode_DATA(self);
9319 kind_sub = PyUnicode_KIND(substring);
9320 data_sub = PyUnicode_DATA(substring);
9321 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9322
9323 if (direction > 0)
9324 offset = end;
9325 else
9326 offset = start;
9327
9328 if (PyUnicode_READ(kind_self, data_self, offset) ==
9329 PyUnicode_READ(kind_sub, data_sub, 0) &&
9330 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9331 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9332 /* If both are of the same kind, memcmp is sufficient */
9333 if (kind_self == kind_sub) {
9334 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009335 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009336 data_sub,
9337 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009338 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009339 }
9340 /* otherwise we have to compare each character by first accesing it */
9341 else {
9342 /* We do not need to compare 0 and len(substring)-1 because
9343 the if statement above ensured already that they are equal
9344 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009345 for (i = 1; i < end_sub; ++i) {
9346 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9347 PyUnicode_READ(kind_sub, data_sub, i))
9348 return 0;
9349 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009350 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009351 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009352 }
9353
9354 return 0;
9355}
9356
Alexander Belopolsky40018472011-02-26 01:02:56 +00009357Py_ssize_t
9358PyUnicode_Tailmatch(PyObject *str,
9359 PyObject *substr,
9360 Py_ssize_t start,
9361 Py_ssize_t end,
9362 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009363{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009364 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009365
Guido van Rossumd57fd912000-03-10 22:53:23 +00009366 str = PyUnicode_FromObject(str);
9367 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009368 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369 substr = PyUnicode_FromObject(substr);
9370 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009371 Py_DECREF(str);
9372 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009373 }
Tim Petersced69f82003-09-16 20:30:58 +00009374
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009375 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009376 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009377 Py_DECREF(str);
9378 Py_DECREF(substr);
9379 return result;
9380}
9381
Guido van Rossumd57fd912000-03-10 22:53:23 +00009382/* Apply fixfct filter to the Unicode object self and return a
9383 reference to the modified object */
9384
Alexander Belopolsky40018472011-02-26 01:02:56 +00009385static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009386fixup(PyObject *self,
9387 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389 PyObject *u;
9390 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009391 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009392
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009393 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009394 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009395 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009396 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009398 /* fix functions return the new maximum character in a string,
9399 if the kind of the resulting unicode object does not change,
9400 everything is fine. Otherwise we need to change the string kind
9401 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009402 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009403
9404 if (maxchar_new == 0) {
9405 /* no changes */;
9406 if (PyUnicode_CheckExact(self)) {
9407 Py_DECREF(u);
9408 Py_INCREF(self);
9409 return self;
9410 }
9411 else
9412 return u;
9413 }
9414
Victor Stinnere6abb482012-05-02 01:15:40 +02009415 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009416
Victor Stinnereaab6042011-12-11 22:22:39 +01009417 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009419
9420 /* In case the maximum character changed, we need to
9421 convert the string to the new category. */
9422 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9423 if (v == NULL) {
9424 Py_DECREF(u);
9425 return NULL;
9426 }
9427 if (maxchar_new > maxchar_old) {
9428 /* If the maxchar increased so that the kind changed, not all
9429 characters are representable anymore and we need to fix the
9430 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009431 _PyUnicode_FastCopyCharacters(v, 0,
9432 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009433 maxchar_old = fixfct(v);
9434 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009435 }
9436 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009437 _PyUnicode_FastCopyCharacters(v, 0,
9438 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009440 Py_DECREF(u);
9441 assert(_PyUnicode_CheckConsistency(v, 1));
9442 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009443}
9444
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009445static PyObject *
9446ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009447{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009448 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9449 char *resdata, *data = PyUnicode_DATA(self);
9450 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009451
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009452 res = PyUnicode_New(len, 127);
9453 if (res == NULL)
9454 return NULL;
9455 resdata = PyUnicode_DATA(res);
9456 if (lower)
9457 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009458 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009459 _Py_bytes_upper(resdata, data, len);
9460 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009461}
9462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009464handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009465{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009466 Py_ssize_t j;
9467 int final_sigma;
9468 Py_UCS4 c;
9469 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009470
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009471 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9472
9473 where ! is a negation and \p{xxx} is a character with property xxx.
9474 */
9475 for (j = i - 1; j >= 0; j--) {
9476 c = PyUnicode_READ(kind, data, j);
9477 if (!_PyUnicode_IsCaseIgnorable(c))
9478 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009480 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9481 if (final_sigma) {
9482 for (j = i + 1; j < length; j++) {
9483 c = PyUnicode_READ(kind, data, j);
9484 if (!_PyUnicode_IsCaseIgnorable(c))
9485 break;
9486 }
9487 final_sigma = j == length || !_PyUnicode_IsCased(c);
9488 }
9489 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009490}
9491
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009492static int
9493lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9494 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009495{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009496 /* Obscure special case. */
9497 if (c == 0x3A3) {
9498 mapped[0] = handle_capital_sigma(kind, data, length, i);
9499 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009501 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502}
9503
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009504static Py_ssize_t
9505do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009506{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009507 Py_ssize_t i, k = 0;
9508 int n_res, j;
9509 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009510
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009511 c = PyUnicode_READ(kind, data, 0);
9512 n_res = _PyUnicode_ToUpperFull(c, mapped);
9513 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009514 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009515 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009517 for (i = 1; i < length; i++) {
9518 c = PyUnicode_READ(kind, data, i);
9519 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9520 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009521 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009522 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009523 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009524 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009525 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009526}
9527
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009528static Py_ssize_t
9529do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9530 Py_ssize_t i, k = 0;
9531
9532 for (i = 0; i < length; i++) {
9533 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9534 int n_res, j;
9535 if (Py_UNICODE_ISUPPER(c)) {
9536 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9537 }
9538 else if (Py_UNICODE_ISLOWER(c)) {
9539 n_res = _PyUnicode_ToUpperFull(c, mapped);
9540 }
9541 else {
9542 n_res = 1;
9543 mapped[0] = c;
9544 }
9545 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009546 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009547 res[k++] = mapped[j];
9548 }
9549 }
9550 return k;
9551}
9552
9553static Py_ssize_t
9554do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9555 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009556{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009557 Py_ssize_t i, k = 0;
9558
9559 for (i = 0; i < length; i++) {
9560 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9561 int n_res, j;
9562 if (lower)
9563 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9564 else
9565 n_res = _PyUnicode_ToUpperFull(c, mapped);
9566 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009567 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009568 res[k++] = mapped[j];
9569 }
9570 }
9571 return k;
9572}
9573
9574static Py_ssize_t
9575do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9576{
9577 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9578}
9579
9580static Py_ssize_t
9581do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9582{
9583 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9584}
9585
Benjamin Petersone51757f2012-01-12 21:10:29 -05009586static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009587do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9588{
9589 Py_ssize_t i, k = 0;
9590
9591 for (i = 0; i < length; i++) {
9592 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9593 Py_UCS4 mapped[3];
9594 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9595 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009596 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009597 res[k++] = mapped[j];
9598 }
9599 }
9600 return k;
9601}
9602
9603static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009604do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9605{
9606 Py_ssize_t i, k = 0;
9607 int previous_is_cased;
9608
9609 previous_is_cased = 0;
9610 for (i = 0; i < length; i++) {
9611 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9612 Py_UCS4 mapped[3];
9613 int n_res, j;
9614
9615 if (previous_is_cased)
9616 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9617 else
9618 n_res = _PyUnicode_ToTitleFull(c, mapped);
9619
9620 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009621 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009622 res[k++] = mapped[j];
9623 }
9624
9625 previous_is_cased = _PyUnicode_IsCased(c);
9626 }
9627 return k;
9628}
9629
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009630static PyObject *
9631case_operation(PyObject *self,
9632 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9633{
9634 PyObject *res = NULL;
9635 Py_ssize_t length, newlength = 0;
9636 int kind, outkind;
9637 void *data, *outdata;
9638 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9639
Benjamin Petersoneea48462012-01-16 14:28:50 -05009640 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009641
9642 kind = PyUnicode_KIND(self);
9643 data = PyUnicode_DATA(self);
9644 length = PyUnicode_GET_LENGTH(self);
9645 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9646 if (tmp == NULL)
9647 return PyErr_NoMemory();
9648 newlength = perform(kind, data, length, tmp, &maxchar);
9649 res = PyUnicode_New(newlength, maxchar);
9650 if (res == NULL)
9651 goto leave;
9652 tmpend = tmp + newlength;
9653 outdata = PyUnicode_DATA(res);
9654 outkind = PyUnicode_KIND(res);
9655 switch (outkind) {
9656 case PyUnicode_1BYTE_KIND:
9657 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9658 break;
9659 case PyUnicode_2BYTE_KIND:
9660 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9661 break;
9662 case PyUnicode_4BYTE_KIND:
9663 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9664 break;
9665 default:
9666 assert(0);
9667 break;
9668 }
9669 leave:
9670 PyMem_FREE(tmp);
9671 return res;
9672}
9673
Tim Peters8ce9f162004-08-27 01:49:32 +00009674PyObject *
9675PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009676{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009678 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009679 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009680 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009681 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9682 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009683 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009684 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009685 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009686 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009687 int use_memcpy;
9688 unsigned char *res_data = NULL, *sep_data = NULL;
9689 PyObject *last_obj;
9690 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009692 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009693 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009694 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009695 }
9696
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009697 /* NOTE: the following code can't call back into Python code,
9698 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009699 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009700
Tim Peters05eba1f2004-08-27 21:32:02 +00009701 seqlen = PySequence_Fast_GET_SIZE(fseq);
9702 /* If empty sequence, return u"". */
9703 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009704 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009705 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009706 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009707
Tim Peters05eba1f2004-08-27 21:32:02 +00009708 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009709 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009710 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009711 if (seqlen == 1) {
9712 if (PyUnicode_CheckExact(items[0])) {
9713 res = items[0];
9714 Py_INCREF(res);
9715 Py_DECREF(fseq);
9716 return res;
9717 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009718 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009719 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009720 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009721 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009722 /* Set up sep and seplen */
9723 if (separator == NULL) {
9724 /* fall back to a blank space separator */
9725 sep = PyUnicode_FromOrdinal(' ');
9726 if (!sep)
9727 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009728 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009729 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009730 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009731 else {
9732 if (!PyUnicode_Check(separator)) {
9733 PyErr_Format(PyExc_TypeError,
9734 "separator: expected str instance,"
9735 " %.80s found",
9736 Py_TYPE(separator)->tp_name);
9737 goto onError;
9738 }
9739 if (PyUnicode_READY(separator))
9740 goto onError;
9741 sep = separator;
9742 seplen = PyUnicode_GET_LENGTH(separator);
9743 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9744 /* inc refcount to keep this code path symmetric with the
9745 above case of a blank separator */
9746 Py_INCREF(sep);
9747 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009748 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009749 }
9750
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009751 /* There are at least two things to join, or else we have a subclass
9752 * of str in the sequence.
9753 * Do a pre-pass to figure out the total amount of space we'll
9754 * need (sz), and see whether all argument are strings.
9755 */
9756 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009757#ifdef Py_DEBUG
9758 use_memcpy = 0;
9759#else
9760 use_memcpy = 1;
9761#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009762 for (i = 0; i < seqlen; i++) {
9763 const Py_ssize_t old_sz = sz;
9764 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009765 if (!PyUnicode_Check(item)) {
9766 PyErr_Format(PyExc_TypeError,
9767 "sequence item %zd: expected str instance,"
9768 " %.80s found",
9769 i, Py_TYPE(item)->tp_name);
9770 goto onError;
9771 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009772 if (PyUnicode_READY(item) == -1)
9773 goto onError;
9774 sz += PyUnicode_GET_LENGTH(item);
9775 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009776 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009777 if (i != 0)
9778 sz += seplen;
9779 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9780 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009781 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009782 goto onError;
9783 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009784 if (use_memcpy && last_obj != NULL) {
9785 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9786 use_memcpy = 0;
9787 }
9788 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009789 }
Tim Petersced69f82003-09-16 20:30:58 +00009790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009791 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009792 if (res == NULL)
9793 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009794
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009795 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009796#ifdef Py_DEBUG
9797 use_memcpy = 0;
9798#else
9799 if (use_memcpy) {
9800 res_data = PyUnicode_1BYTE_DATA(res);
9801 kind = PyUnicode_KIND(res);
9802 if (seplen != 0)
9803 sep_data = PyUnicode_1BYTE_DATA(sep);
9804 }
9805#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009806 if (use_memcpy) {
9807 for (i = 0; i < seqlen; ++i) {
9808 Py_ssize_t itemlen;
9809 item = items[i];
9810
9811 /* Copy item, and maybe the separator. */
9812 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009813 Py_MEMCPY(res_data,
9814 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009815 kind * seplen);
9816 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009817 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009818
9819 itemlen = PyUnicode_GET_LENGTH(item);
9820 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009821 Py_MEMCPY(res_data,
9822 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009823 kind * itemlen);
9824 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009825 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009826 }
9827 assert(res_data == PyUnicode_1BYTE_DATA(res)
9828 + kind * PyUnicode_GET_LENGTH(res));
9829 }
9830 else {
9831 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9832 Py_ssize_t itemlen;
9833 item = items[i];
9834
9835 /* Copy item, and maybe the separator. */
9836 if (i && seplen != 0) {
9837 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9838 res_offset += seplen;
9839 }
9840
9841 itemlen = PyUnicode_GET_LENGTH(item);
9842 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009843 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009844 res_offset += itemlen;
9845 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009846 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009847 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009848 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009849
Tim Peters05eba1f2004-08-27 21:32:02 +00009850 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009852 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009853 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009854
Benjamin Peterson29060642009-01-31 22:14:21 +00009855 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009856 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009858 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859 return NULL;
9860}
9861
/* Store `value` into `length` consecutive characters of the buffer
   `data`, beginning at character index `start`.  `kind` selects the
   character width (1, 2 or 4 byte kinds); PyUnicode_WCHAR_KIND is
   rejected by an assert.  Every argument is fully parenthesized so that
   expression arguments keep their meaning; `value` and `length` may be
   evaluated several times and must not have side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9885
Victor Stinnerd3f08822012-05-29 12:57:52 +02009886void
9887_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9888 Py_UCS4 fill_char)
9889{
9890 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9891 const void *data = PyUnicode_DATA(unicode);
9892 assert(PyUnicode_IS_READY(unicode));
9893 assert(unicode_modifiable(unicode));
9894 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9895 assert(start >= 0);
9896 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9897 FILL(kind, data, fill_char, start, length);
9898}
9899
Victor Stinner3fe55312012-01-04 00:33:50 +01009900Py_ssize_t
9901PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9902 Py_UCS4 fill_char)
9903{
9904 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009905
9906 if (!PyUnicode_Check(unicode)) {
9907 PyErr_BadInternalCall();
9908 return -1;
9909 }
9910 if (PyUnicode_READY(unicode) == -1)
9911 return -1;
9912 if (unicode_check_modifiable(unicode))
9913 return -1;
9914
Victor Stinnerd3f08822012-05-29 12:57:52 +02009915 if (start < 0) {
9916 PyErr_SetString(PyExc_IndexError, "string index out of range");
9917 return -1;
9918 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009919 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9920 PyErr_SetString(PyExc_ValueError,
9921 "fill character is bigger than "
9922 "the string maximum character");
9923 return -1;
9924 }
9925
9926 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9927 length = Py_MIN(maxlen, length);
9928 if (length <= 0)
9929 return 0;
9930
Victor Stinnerd3f08822012-05-29 12:57:52 +02009931 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009932 return length;
9933}
9934
Victor Stinner9310abb2011-10-05 00:59:23 +02009935static PyObject *
9936pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009937 Py_ssize_t left,
9938 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009940{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 PyObject *u;
9942 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009943 int kind;
9944 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009945
9946 if (left < 0)
9947 left = 0;
9948 if (right < 0)
9949 right = 0;
9950
Victor Stinnerc4b49542011-12-11 22:44:26 +01009951 if (left == 0 && right == 0)
9952 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009954 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9955 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009956 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9957 return NULL;
9958 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009960 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009961 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009962 if (!u)
9963 return NULL;
9964
9965 kind = PyUnicode_KIND(u);
9966 data = PyUnicode_DATA(u);
9967 if (left)
9968 FILL(kind, data, fill, 0, left);
9969 if (right)
9970 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009971 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009972 assert(_PyUnicode_CheckConsistency(u, 1));
9973 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009974}
9975
Alexander Belopolsky40018472011-02-26 01:02:56 +00009976PyObject *
9977PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009978{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009979 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009980
9981 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009982 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009983 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009984 if (PyUnicode_READY(string) == -1) {
9985 Py_DECREF(string);
9986 return NULL;
9987 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009988
Benjamin Petersonead6b532011-12-20 17:23:42 -06009989 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009990 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009991 if (PyUnicode_IS_ASCII(string))
9992 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009993 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009994 PyUnicode_GET_LENGTH(string), keepends);
9995 else
9996 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009997 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009998 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009999 break;
10000 case PyUnicode_2BYTE_KIND:
10001 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010002 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010003 PyUnicode_GET_LENGTH(string), keepends);
10004 break;
10005 case PyUnicode_4BYTE_KIND:
10006 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010007 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010008 PyUnicode_GET_LENGTH(string), keepends);
10009 break;
10010 default:
10011 assert(0);
10012 list = 0;
10013 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010014 Py_DECREF(string);
10015 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010016}
10017
Alexander Belopolsky40018472011-02-26 01:02:56 +000010018static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010019split(PyObject *self,
10020 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010021 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010022{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023 int kind1, kind2, kind;
10024 void *buf1, *buf2;
10025 Py_ssize_t len1, len2;
10026 PyObject* out;
10027
Guido van Rossumd57fd912000-03-10 22:53:23 +000010028 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010029 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 if (PyUnicode_READY(self) == -1)
10032 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010034 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010035 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010037 if (PyUnicode_IS_ASCII(self))
10038 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010039 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010040 PyUnicode_GET_LENGTH(self), maxcount
10041 );
10042 else
10043 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010044 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010045 PyUnicode_GET_LENGTH(self), maxcount
10046 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010047 case PyUnicode_2BYTE_KIND:
10048 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010049 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050 PyUnicode_GET_LENGTH(self), maxcount
10051 );
10052 case PyUnicode_4BYTE_KIND:
10053 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010054 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010055 PyUnicode_GET_LENGTH(self), maxcount
10056 );
10057 default:
10058 assert(0);
10059 return NULL;
10060 }
10061
10062 if (PyUnicode_READY(substring) == -1)
10063 return NULL;
10064
10065 kind1 = PyUnicode_KIND(self);
10066 kind2 = PyUnicode_KIND(substring);
10067 kind = kind1 > kind2 ? kind1 : kind2;
10068 buf1 = PyUnicode_DATA(self);
10069 buf2 = PyUnicode_DATA(substring);
10070 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010071 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 if (!buf1)
10073 return NULL;
10074 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010075 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010076 if (!buf2) {
10077 if (kind1 != kind) PyMem_Free(buf1);
10078 return NULL;
10079 }
10080 len1 = PyUnicode_GET_LENGTH(self);
10081 len2 = PyUnicode_GET_LENGTH(substring);
10082
Benjamin Petersonead6b532011-12-20 17:23:42 -060010083 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010085 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10086 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010087 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010088 else
10089 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010090 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 break;
10092 case PyUnicode_2BYTE_KIND:
10093 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010094 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 break;
10096 case PyUnicode_4BYTE_KIND:
10097 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010098 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 break;
10100 default:
10101 out = NULL;
10102 }
10103 if (kind1 != kind)
10104 PyMem_Free(buf1);
10105 if (kind2 != kind)
10106 PyMem_Free(buf2);
10107 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010108}
10109
Alexander Belopolsky40018472011-02-26 01:02:56 +000010110static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010111rsplit(PyObject *self,
10112 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010113 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010114{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 int kind1, kind2, kind;
10116 void *buf1, *buf2;
10117 Py_ssize_t len1, len2;
10118 PyObject* out;
10119
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010120 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010121 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010123 if (PyUnicode_READY(self) == -1)
10124 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010126 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010127 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010129 if (PyUnicode_IS_ASCII(self))
10130 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010131 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010132 PyUnicode_GET_LENGTH(self), maxcount
10133 );
10134 else
10135 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010136 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010137 PyUnicode_GET_LENGTH(self), maxcount
10138 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010139 case PyUnicode_2BYTE_KIND:
10140 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010141 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 PyUnicode_GET_LENGTH(self), maxcount
10143 );
10144 case PyUnicode_4BYTE_KIND:
10145 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010146 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 PyUnicode_GET_LENGTH(self), maxcount
10148 );
10149 default:
10150 assert(0);
10151 return NULL;
10152 }
10153
10154 if (PyUnicode_READY(substring) == -1)
10155 return NULL;
10156
10157 kind1 = PyUnicode_KIND(self);
10158 kind2 = PyUnicode_KIND(substring);
10159 kind = kind1 > kind2 ? kind1 : kind2;
10160 buf1 = PyUnicode_DATA(self);
10161 buf2 = PyUnicode_DATA(substring);
10162 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010163 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 if (!buf1)
10165 return NULL;
10166 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010167 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 if (!buf2) {
10169 if (kind1 != kind) PyMem_Free(buf1);
10170 return NULL;
10171 }
10172 len1 = PyUnicode_GET_LENGTH(self);
10173 len2 = PyUnicode_GET_LENGTH(substring);
10174
Benjamin Petersonead6b532011-12-20 17:23:42 -060010175 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010177 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10178 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010179 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010180 else
10181 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010182 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 break;
10184 case PyUnicode_2BYTE_KIND:
10185 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010186 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 break;
10188 case PyUnicode_4BYTE_KIND:
10189 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010190 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 break;
10192 default:
10193 out = NULL;
10194 }
10195 if (kind1 != kind)
10196 PyMem_Free(buf1);
10197 if (kind2 != kind)
10198 PyMem_Free(buf2);
10199 return out;
10200}
10201
10202static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010203anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10204 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010205{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010206 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010208 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10209 return asciilib_find(buf1, len1, buf2, len2, offset);
10210 else
10211 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 case PyUnicode_2BYTE_KIND:
10213 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10214 case PyUnicode_4BYTE_KIND:
10215 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10216 }
10217 assert(0);
10218 return -1;
10219}
10220
10221static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010222anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10223 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010225 switch (kind) {
10226 case PyUnicode_1BYTE_KIND:
10227 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10228 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10229 else
10230 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10231 case PyUnicode_2BYTE_KIND:
10232 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10233 case PyUnicode_4BYTE_KIND:
10234 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10235 }
10236 assert(0);
10237 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010238}
10239
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010240static void
10241replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10242 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10243{
10244 int kind = PyUnicode_KIND(u);
10245 void *data = PyUnicode_DATA(u);
10246 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10247 if (kind == PyUnicode_1BYTE_KIND) {
10248 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10249 (Py_UCS1 *)data + len,
10250 u1, u2, maxcount);
10251 }
10252 else if (kind == PyUnicode_2BYTE_KIND) {
10253 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10254 (Py_UCS2 *)data + len,
10255 u1, u2, maxcount);
10256 }
10257 else {
10258 assert(kind == PyUnicode_4BYTE_KIND);
10259 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10260 (Py_UCS4 *)data + len,
10261 u1, u2, maxcount);
10262 }
10263}
10264
Alexander Belopolsky40018472011-02-26 01:02:56 +000010265static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266replace(PyObject *self, PyObject *str1,
10267 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010268{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 PyObject *u;
10270 char *sbuf = PyUnicode_DATA(self);
10271 char *buf1 = PyUnicode_DATA(str1);
10272 char *buf2 = PyUnicode_DATA(str2);
10273 int srelease = 0, release1 = 0, release2 = 0;
10274 int skind = PyUnicode_KIND(self);
10275 int kind1 = PyUnicode_KIND(str1);
10276 int kind2 = PyUnicode_KIND(str2);
10277 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10278 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10279 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010280 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010281 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010282
10283 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010284 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010286 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010287
Victor Stinner59de0ee2011-10-07 10:01:28 +020010288 if (str1 == str2)
10289 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290
Victor Stinner49a0a212011-10-12 23:46:10 +020010291 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010292 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10293 if (maxchar < maxchar_str1)
10294 /* substring too wide to be present */
10295 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010296 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10297 /* Replacing str1 with str2 may cause a maxchar reduction in the
10298 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010299 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010300 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010303 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010305 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010307 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010308 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010309 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010310
Victor Stinner69ed0f42013-04-09 21:48:24 +020010311 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010312 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010313 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010314 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010315 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010317 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010319
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010320 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10321 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010322 }
10323 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 int rkind = skind;
10325 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010326 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010327
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 if (kind1 < rkind) {
10329 /* widen substring */
10330 buf1 = _PyUnicode_AsKind(str1, rkind);
10331 if (!buf1) goto error;
10332 release1 = 1;
10333 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010334 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010335 if (i < 0)
10336 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 if (rkind > kind2) {
10338 /* widen replacement */
10339 buf2 = _PyUnicode_AsKind(str2, rkind);
10340 if (!buf2) goto error;
10341 release2 = 1;
10342 }
10343 else if (rkind < kind2) {
10344 /* widen self and buf1 */
10345 rkind = kind2;
10346 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010347 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 sbuf = _PyUnicode_AsKind(self, rkind);
10349 if (!sbuf) goto error;
10350 srelease = 1;
10351 buf1 = _PyUnicode_AsKind(str1, rkind);
10352 if (!buf1) goto error;
10353 release1 = 1;
10354 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010355 u = PyUnicode_New(slen, maxchar);
10356 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010358 assert(PyUnicode_KIND(u) == rkind);
10359 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010360
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010361 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010362 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010363 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010365 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010367
10368 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010369 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010370 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010371 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010372 if (i == -1)
10373 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010374 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010376 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010378 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010380 }
10381 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010383 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 int rkind = skind;
10385 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010388 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 buf1 = _PyUnicode_AsKind(str1, rkind);
10390 if (!buf1) goto error;
10391 release1 = 1;
10392 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010393 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010394 if (n == 0)
10395 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010397 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 buf2 = _PyUnicode_AsKind(str2, rkind);
10399 if (!buf2) goto error;
10400 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010401 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010403 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 rkind = kind2;
10405 sbuf = _PyUnicode_AsKind(self, rkind);
10406 if (!sbuf) goto error;
10407 srelease = 1;
10408 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010409 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 buf1 = _PyUnicode_AsKind(str1, rkind);
10411 if (!buf1) goto error;
10412 release1 = 1;
10413 }
10414 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10415 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010416 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 PyErr_SetString(PyExc_OverflowError,
10418 "replace string is too long");
10419 goto error;
10420 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010421 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010422 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010423 _Py_INCREF_UNICODE_EMPTY();
10424 if (!unicode_empty)
10425 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010426 u = unicode_empty;
10427 goto done;
10428 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010429 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 PyErr_SetString(PyExc_OverflowError,
10431 "replace string is too long");
10432 goto error;
10433 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010434 u = PyUnicode_New(new_size, maxchar);
10435 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010437 assert(PyUnicode_KIND(u) == rkind);
10438 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 ires = i = 0;
10440 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010441 while (n-- > 0) {
10442 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010443 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010444 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010445 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010446 if (j == -1)
10447 break;
10448 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010449 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010450 memcpy(res + rkind * ires,
10451 sbuf + rkind * i,
10452 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010454 }
10455 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010457 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010459 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010461 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010465 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010466 memcpy(res + rkind * ires,
10467 sbuf + rkind * i,
10468 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010469 }
10470 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010471 /* interleave */
10472 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010473 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010475 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010477 if (--n <= 0)
10478 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010479 memcpy(res + rkind * ires,
10480 sbuf + rkind * i,
10481 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 ires++;
10483 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010484 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010485 memcpy(res + rkind * ires,
10486 sbuf + rkind * i,
10487 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010488 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010489 }
10490
10491 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010492 unicode_adjust_maxchar(&u);
10493 if (u == NULL)
10494 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010495 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010496
10497 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 if (srelease)
10499 PyMem_FREE(sbuf);
10500 if (release1)
10501 PyMem_FREE(buf1);
10502 if (release2)
10503 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010504 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010506
Benjamin Peterson29060642009-01-31 22:14:21 +000010507 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010508 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 if (srelease)
10510 PyMem_FREE(sbuf);
10511 if (release1)
10512 PyMem_FREE(buf1);
10513 if (release2)
10514 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010515 return unicode_result_unchanged(self);
10516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 error:
10518 if (srelease && sbuf)
10519 PyMem_FREE(sbuf);
10520 if (release1 && buf1)
10521 PyMem_FREE(buf1);
10522 if (release2 && buf2)
10523 PyMem_FREE(buf2);
10524 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525}
10526
10527/* --- Unicode Object Methods --------------------------------------------- */
10528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010529PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010530 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531\n\
10532Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010533characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010534
10535static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010536unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010537{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010538 if (PyUnicode_READY(self) == -1)
10539 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010540 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010541}
10542
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010543PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010544 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010545\n\
10546Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010547have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010548
10549static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010550unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010551{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010552 if (PyUnicode_READY(self) == -1)
10553 return NULL;
10554 if (PyUnicode_GET_LENGTH(self) == 0)
10555 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010556 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010557}
10558
Benjamin Petersond5890c82012-01-14 13:23:30 -050010559PyDoc_STRVAR(casefold__doc__,
10560 "S.casefold() -> str\n\
10561\n\
10562Return a version of S suitable for caseless comparisons.");
10563
10564static PyObject *
10565unicode_casefold(PyObject *self)
10566{
10567 if (PyUnicode_READY(self) == -1)
10568 return NULL;
10569 if (PyUnicode_IS_ASCII(self))
10570 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010571 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010572}
10573
10574
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010575/* Argument converter. Coerces to a single unicode character */
10576
10577static int
10578convert_uc(PyObject *obj, void *addr)
10579{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010581 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010582
Benjamin Peterson14339b62009-01-31 16:36:08 +000010583 uniobj = PyUnicode_FromObject(obj);
10584 if (uniobj == NULL) {
10585 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010586 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010587 return 0;
10588 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010590 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010591 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010592 Py_DECREF(uniobj);
10593 return 0;
10594 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010596 Py_DECREF(uniobj);
10597 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010598}
10599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010600PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010601 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010602\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010603Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010604done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010605
10606static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010607unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010608{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010609 Py_ssize_t marg, left;
10610 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 Py_UCS4 fillchar = ' ';
10612
Victor Stinnere9a29352011-10-01 02:14:59 +020010613 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615
Benjamin Petersonbac79492012-01-14 13:34:47 -050010616 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617 return NULL;
10618
Victor Stinnerc4b49542011-12-11 22:44:26 +010010619 if (PyUnicode_GET_LENGTH(self) >= width)
10620 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621
Victor Stinnerc4b49542011-12-11 22:44:26 +010010622 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010623 left = marg / 2 + (marg & width & 1);
10624
Victor Stinner9310abb2011-10-05 00:59:23 +020010625 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010626}
10627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628/* This function assumes that str1 and str2 are readied by the caller. */
10629
Marc-André Lemburge5034372000-08-08 08:04:29 +000010630static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010631unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010632{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010633#define COMPARE(TYPE1, TYPE2) \
10634 do { \
10635 TYPE1* p1 = (TYPE1 *)data1; \
10636 TYPE2* p2 = (TYPE2 *)data2; \
10637 TYPE1* end = p1 + len; \
10638 Py_UCS4 c1, c2; \
10639 for (; p1 != end; p1++, p2++) { \
10640 c1 = *p1; \
10641 c2 = *p2; \
10642 if (c1 != c2) \
10643 return (c1 < c2) ? -1 : 1; \
10644 } \
10645 } \
10646 while (0)
10647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 int kind1, kind2;
10649 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010650 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 kind1 = PyUnicode_KIND(str1);
10653 kind2 = PyUnicode_KIND(str2);
10654 data1 = PyUnicode_DATA(str1);
10655 data2 = PyUnicode_DATA(str2);
10656 len1 = PyUnicode_GET_LENGTH(str1);
10657 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010658 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010659
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010660 switch(kind1) {
10661 case PyUnicode_1BYTE_KIND:
10662 {
10663 switch(kind2) {
10664 case PyUnicode_1BYTE_KIND:
10665 {
10666 int cmp = memcmp(data1, data2, len);
10667 /* normalize result of memcmp() into the range [-1; 1] */
10668 if (cmp < 0)
10669 return -1;
10670 if (cmp > 0)
10671 return 1;
10672 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010673 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010674 case PyUnicode_2BYTE_KIND:
10675 COMPARE(Py_UCS1, Py_UCS2);
10676 break;
10677 case PyUnicode_4BYTE_KIND:
10678 COMPARE(Py_UCS1, Py_UCS4);
10679 break;
10680 default:
10681 assert(0);
10682 }
10683 break;
10684 }
10685 case PyUnicode_2BYTE_KIND:
10686 {
10687 switch(kind2) {
10688 case PyUnicode_1BYTE_KIND:
10689 COMPARE(Py_UCS2, Py_UCS1);
10690 break;
10691 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010692 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010693 COMPARE(Py_UCS2, Py_UCS2);
10694 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010695 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010696 case PyUnicode_4BYTE_KIND:
10697 COMPARE(Py_UCS2, Py_UCS4);
10698 break;
10699 default:
10700 assert(0);
10701 }
10702 break;
10703 }
10704 case PyUnicode_4BYTE_KIND:
10705 {
10706 switch(kind2) {
10707 case PyUnicode_1BYTE_KIND:
10708 COMPARE(Py_UCS4, Py_UCS1);
10709 break;
10710 case PyUnicode_2BYTE_KIND:
10711 COMPARE(Py_UCS4, Py_UCS2);
10712 break;
10713 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010714 {
10715#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10716 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10717 /* normalize result of wmemcmp() into the range [-1; 1] */
10718 if (cmp < 0)
10719 return -1;
10720 if (cmp > 0)
10721 return 1;
10722#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010723 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010724#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010725 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010726 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010727 default:
10728 assert(0);
10729 }
10730 break;
10731 }
10732 default:
10733 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010734 }
10735
Victor Stinner770e19e2012-10-04 22:59:45 +020010736 if (len1 == len2)
10737 return 0;
10738 if (len1 < len2)
10739 return -1;
10740 else
10741 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010742
10743#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010744}
10745
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010746Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010747unicode_compare_eq(PyObject *str1, PyObject *str2)
10748{
10749 int kind;
10750 void *data1, *data2;
10751 Py_ssize_t len;
10752 int cmp;
10753
Victor Stinnere5567ad2012-10-23 02:48:49 +020010754 len = PyUnicode_GET_LENGTH(str1);
10755 if (PyUnicode_GET_LENGTH(str2) != len)
10756 return 0;
10757 kind = PyUnicode_KIND(str1);
10758 if (PyUnicode_KIND(str2) != kind)
10759 return 0;
10760 data1 = PyUnicode_DATA(str1);
10761 data2 = PyUnicode_DATA(str2);
10762
10763 cmp = memcmp(data1, data2, len * kind);
10764 return (cmp == 0);
10765}
10766
10767
Alexander Belopolsky40018472011-02-26 01:02:56 +000010768int
10769PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10772 if (PyUnicode_READY(left) == -1 ||
10773 PyUnicode_READY(right) == -1)
10774 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010775
10776 /* a string is equal to itself */
10777 if (left == right)
10778 return 0;
10779
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010780 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010781 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010782 PyErr_Format(PyExc_TypeError,
10783 "Can't compare %.100s and %.100s",
10784 left->ob_type->tp_name,
10785 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786 return -1;
10787}
10788
Martin v. Löwis5b222132007-06-10 09:51:05 +000010789int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010790_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10791{
10792 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10793 if (right_str == NULL)
10794 return -1;
10795 return PyUnicode_Compare(left, right_str);
10796}
10797
10798int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010799PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10800{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 Py_ssize_t i;
10802 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 Py_UCS4 chr;
10804
Victor Stinner910337b2011-10-03 03:20:16 +020010805 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806 if (PyUnicode_READY(uni) == -1)
10807 return -1;
10808 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010809 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010810 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010811 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010812 size_t len, len2 = strlen(str);
10813 int cmp;
10814
10815 len = Py_MIN(len1, len2);
10816 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010817 if (cmp != 0) {
10818 if (cmp < 0)
10819 return -1;
10820 else
10821 return 1;
10822 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010823 if (len1 > len2)
10824 return 1; /* uni is longer */
10825 if (len2 > len1)
10826 return -1; /* str is longer */
10827 return 0;
10828 }
10829 else {
10830 void *data = PyUnicode_DATA(uni);
10831 /* Compare Unicode string and source character set string */
10832 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10833 if (chr != str[i])
10834 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10835 /* This check keeps Python strings that end in '\0' from comparing equal
10836 to C strings identical up to that point. */
10837 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10838 return 1; /* uni is longer */
10839 if (str[i])
10840 return -1; /* str is longer */
10841 return 0;
10842 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010843}
10844
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010845
Benjamin Peterson29060642009-01-31 22:14:21 +000010846#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010847 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010848
Alexander Belopolsky40018472011-02-26 01:02:56 +000010849PyObject *
10850PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010851{
10852 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010853 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010854
Victor Stinnere5567ad2012-10-23 02:48:49 +020010855 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10856 Py_RETURN_NOTIMPLEMENTED;
10857
10858 if (PyUnicode_READY(left) == -1 ||
10859 PyUnicode_READY(right) == -1)
10860 return NULL;
10861
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010862 if (left == right) {
10863 switch (op) {
10864 case Py_EQ:
10865 case Py_LE:
10866 case Py_GE:
10867 /* a string is equal to itself */
10868 v = Py_True;
10869 break;
10870 case Py_NE:
10871 case Py_LT:
10872 case Py_GT:
10873 v = Py_False;
10874 break;
10875 default:
10876 PyErr_BadArgument();
10877 return NULL;
10878 }
10879 }
10880 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010881 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010882 result ^= (op == Py_NE);
10883 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010884 }
10885 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010886 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010887
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010888 /* Convert the return value to a Boolean */
10889 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010890 case Py_LE:
10891 v = TEST_COND(result <= 0);
10892 break;
10893 case Py_GE:
10894 v = TEST_COND(result >= 0);
10895 break;
10896 case Py_LT:
10897 v = TEST_COND(result == -1);
10898 break;
10899 case Py_GT:
10900 v = TEST_COND(result == 1);
10901 break;
10902 default:
10903 PyErr_BadArgument();
10904 return NULL;
10905 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010906 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010907 Py_INCREF(v);
10908 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010909}
10910
Alexander Belopolsky40018472011-02-26 01:02:56 +000010911int
10912PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010913{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010914 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010915 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916 void *buf1, *buf2;
10917 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010918 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010919
10920 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010921 sub = PyUnicode_FromObject(element);
10922 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010923 PyErr_Format(PyExc_TypeError,
10924 "'in <string>' requires string as left operand, not %s",
10925 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010926 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010927 }
10928
Thomas Wouters477c8d52006-05-27 19:21:47 +000010929 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010930 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010931 Py_DECREF(sub);
10932 return -1;
10933 }
10934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010935 kind1 = PyUnicode_KIND(str);
10936 kind2 = PyUnicode_KIND(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010937 buf1 = PyUnicode_DATA(str);
10938 buf2 = PyUnicode_DATA(sub);
Victor Stinner77282cb2013-04-14 19:22:47 +020010939 if (kind2 != kind1) {
10940 if (kind2 > kind1) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010941 Py_DECREF(sub);
10942 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010943 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010944 }
Victor Stinner77282cb2013-04-14 19:22:47 +020010945 buf2 = _PyUnicode_AsKind(sub, kind1);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010946 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947 if (!buf2) {
10948 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010949 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010950 return -1;
10951 }
10952 len1 = PyUnicode_GET_LENGTH(str);
10953 len2 = PyUnicode_GET_LENGTH(sub);
10954
Victor Stinner77282cb2013-04-14 19:22:47 +020010955 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010956 case PyUnicode_1BYTE_KIND:
10957 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10958 break;
10959 case PyUnicode_2BYTE_KIND:
10960 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10961 break;
10962 case PyUnicode_4BYTE_KIND:
10963 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10964 break;
10965 default:
10966 result = -1;
10967 assert(0);
10968 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010969
10970 Py_DECREF(str);
10971 Py_DECREF(sub);
10972
Victor Stinner77282cb2013-04-14 19:22:47 +020010973 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010974 PyMem_Free(buf2);
10975
Guido van Rossum403d68b2000-03-13 15:55:09 +000010976 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010977}
10978
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979/* Concat to string or Unicode object giving a new Unicode object. */
10980
Alexander Belopolsky40018472011-02-26 01:02:56 +000010981PyObject *
10982PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010984 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010985 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010986 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987
10988 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010991 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010994 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995
10996 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010997 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010998 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010999 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011001 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011002 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004 }
11005
Victor Stinner488fa492011-12-12 00:01:39 +010011006 u_len = PyUnicode_GET_LENGTH(u);
11007 v_len = PyUnicode_GET_LENGTH(v);
11008 if (u_len > PY_SSIZE_T_MAX - v_len) {
11009 PyErr_SetString(PyExc_OverflowError,
11010 "strings are too large to concat");
11011 goto onError;
11012 }
11013 new_len = u_len + v_len;
11014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011015 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011016 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011017 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011018
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011020 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011022 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011023 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11024 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011025 Py_DECREF(u);
11026 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011027 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029
Benjamin Peterson29060642009-01-31 22:14:21 +000011030 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031 Py_XDECREF(u);
11032 Py_XDECREF(v);
11033 return NULL;
11034}
11035
Walter Dörwald1ab83302007-05-18 17:15:44 +000011036void
Victor Stinner23e56682011-10-03 03:54:37 +020011037PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011038{
Victor Stinner23e56682011-10-03 03:54:37 +020011039 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011040 Py_UCS4 maxchar, maxchar2;
11041 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011042
11043 if (p_left == NULL) {
11044 if (!PyErr_Occurred())
11045 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011046 return;
11047 }
Victor Stinner23e56682011-10-03 03:54:37 +020011048 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011049 if (right == NULL || left == NULL
11050 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011051 if (!PyErr_Occurred())
11052 PyErr_BadInternalCall();
11053 goto error;
11054 }
11055
Benjamin Petersonbac79492012-01-14 13:34:47 -050011056 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011057 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011058 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011059 goto error;
11060
Victor Stinner488fa492011-12-12 00:01:39 +010011061 /* Shortcuts */
11062 if (left == unicode_empty) {
11063 Py_DECREF(left);
11064 Py_INCREF(right);
11065 *p_left = right;
11066 return;
11067 }
11068 if (right == unicode_empty)
11069 return;
11070
11071 left_len = PyUnicode_GET_LENGTH(left);
11072 right_len = PyUnicode_GET_LENGTH(right);
11073 if (left_len > PY_SSIZE_T_MAX - right_len) {
11074 PyErr_SetString(PyExc_OverflowError,
11075 "strings are too large to concat");
11076 goto error;
11077 }
11078 new_len = left_len + right_len;
11079
11080 if (unicode_modifiable(left)
11081 && PyUnicode_CheckExact(right)
11082 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011083 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11084 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011085 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011086 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011087 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11088 {
11089 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011090 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011091 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011092
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011093 /* copy 'right' into the newly allocated area of 'left' */
11094 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011095 }
Victor Stinner488fa492011-12-12 00:01:39 +010011096 else {
11097 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11098 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011099 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011100
Victor Stinner488fa492011-12-12 00:01:39 +010011101 /* Concat the two Unicode strings */
11102 res = PyUnicode_New(new_len, maxchar);
11103 if (res == NULL)
11104 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011105 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11106 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011107 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011108 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011109 }
11110 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011111 return;
11112
11113error:
Victor Stinner488fa492011-12-12 00:01:39 +010011114 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011115}
11116
11117void
11118PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11119{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011120 PyUnicode_Append(pleft, right);
11121 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011122}
11123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011124PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011125 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011126\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011127Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011128string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011129interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011130
11131static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011132unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011133{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011134 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011135 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011136 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011138 int kind1, kind2, kind;
11139 void *buf1, *buf2;
11140 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141
Jesus Ceaac451502011-04-20 17:09:23 +020011142 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11143 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011144 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146 kind1 = PyUnicode_KIND(self);
11147 kind2 = PyUnicode_KIND(substring);
Christian Heimesd47802e2013-06-29 21:33:36 +020011148 if (kind2 > kind1) {
11149 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011150 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011151 }
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011152 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011153 buf1 = PyUnicode_DATA(self);
11154 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011155 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011156 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 if (!buf2) {
11158 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159 return NULL;
11160 }
11161 len1 = PyUnicode_GET_LENGTH(self);
11162 len2 = PyUnicode_GET_LENGTH(substring);
11163
11164 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011165 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011166 case PyUnicode_1BYTE_KIND:
11167 iresult = ucs1lib_count(
11168 ((Py_UCS1*)buf1) + start, end - start,
11169 buf2, len2, PY_SSIZE_T_MAX
11170 );
11171 break;
11172 case PyUnicode_2BYTE_KIND:
11173 iresult = ucs2lib_count(
11174 ((Py_UCS2*)buf1) + start, end - start,
11175 buf2, len2, PY_SSIZE_T_MAX
11176 );
11177 break;
11178 case PyUnicode_4BYTE_KIND:
11179 iresult = ucs4lib_count(
11180 ((Py_UCS4*)buf1) + start, end - start,
11181 buf2, len2, PY_SSIZE_T_MAX
11182 );
11183 break;
11184 default:
11185 assert(0); iresult = 0;
11186 }
11187
11188 result = PyLong_FromSsize_t(iresult);
11189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 if (kind2 != kind)
11191 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011192
11193 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011194
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195 return result;
11196}
11197
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011198PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011199 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011200\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011201Encode S using the codec registered for encoding. Default encoding\n\
11202is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011203handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011204a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11205'xmlcharrefreplace' as well as any other name registered with\n\
11206codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207
11208static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011209unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011210{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011211 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212 char *encoding = NULL;
11213 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011214
Benjamin Peterson308d6372009-09-18 21:42:35 +000011215 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11216 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011218 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011219}
11220
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011221PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011222 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223\n\
11224Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011225If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011226
11227static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011228unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011229{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011230 Py_ssize_t i, j, line_pos, src_len, incr;
11231 Py_UCS4 ch;
11232 PyObject *u;
11233 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011234 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011236 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011237 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238
Ezio Melotti745d54d2013-11-16 19:10:57 +020011239 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11240 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011241 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242
Antoine Pitrou22425222011-10-04 19:10:51 +020011243 if (PyUnicode_READY(self) == -1)
11244 return NULL;
11245
Thomas Wouters7e474022000-07-16 12:04:32 +000011246 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011247 src_len = PyUnicode_GET_LENGTH(self);
11248 i = j = line_pos = 0;
11249 kind = PyUnicode_KIND(self);
11250 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011251 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011252 for (; i < src_len; i++) {
11253 ch = PyUnicode_READ(kind, src_data, i);
11254 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011255 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011256 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011257 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011258 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011259 goto overflow;
11260 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011261 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011262 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011265 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011266 goto overflow;
11267 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011269 if (ch == '\n' || ch == '\r')
11270 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011271 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011272 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011273 if (!found)
11274 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011275
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011277 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278 if (!u)
11279 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011280 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281
Antoine Pitroue71d5742011-10-04 15:55:09 +020011282 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283
Antoine Pitroue71d5742011-10-04 15:55:09 +020011284 for (; i < src_len; i++) {
11285 ch = PyUnicode_READ(kind, src_data, i);
11286 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011287 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011288 incr = tabsize - (line_pos % tabsize);
11289 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011290 FILL(kind, dest_data, ' ', j, incr);
11291 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011292 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011293 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011294 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011295 line_pos++;
11296 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011297 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011298 if (ch == '\n' || ch == '\r')
11299 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011301 }
11302 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011303 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011304
Antoine Pitroue71d5742011-10-04 15:55:09 +020011305 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011306 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11307 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011308}
11309
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011310PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011311 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011312\n\
11313Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011314such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315arguments start and end are interpreted as in slice notation.\n\
11316\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011317Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318
11319static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011320unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011322 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011323 Py_ssize_t start;
11324 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011325 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326
Jesus Ceaac451502011-04-20 17:09:23 +020011327 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11328 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011330
Christian Heimesd47802e2013-06-29 21:33:36 +020011331 if (PyUnicode_READY(self) == -1) {
11332 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011334 }
11335 if (PyUnicode_READY(substring) == -1) {
11336 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339
Victor Stinner7931d9a2011-11-04 00:22:48 +010011340 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341
11342 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011344 if (result == -2)
11345 return NULL;
11346
Christian Heimes217cfd12007-12-02 14:31:20 +000011347 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348}
11349
11350static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011351unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011353 void *data;
11354 enum PyUnicode_Kind kind;
11355 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011356
11357 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11358 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011359 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011360 }
11361 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11362 PyErr_SetString(PyExc_IndexError, "string index out of range");
11363 return NULL;
11364 }
11365 kind = PyUnicode_KIND(self);
11366 data = PyUnicode_DATA(self);
11367 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011368 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369}
11370
Guido van Rossumc2504932007-09-18 19:42:40 +000011371/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011372 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011373static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011374unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375{
Guido van Rossumc2504932007-09-18 19:42:40 +000011376 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011377 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011378
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011379#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011380 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011381#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011382 if (_PyUnicode_HASH(self) != -1)
11383 return _PyUnicode_HASH(self);
11384 if (PyUnicode_READY(self) == -1)
11385 return -1;
11386 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011387 /*
11388 We make the hash of the empty string be 0, rather than using
11389 (prefix ^ suffix), since this slightly obfuscates the hash secret
11390 */
11391 if (len == 0) {
11392 _PyUnicode_HASH(self) = 0;
11393 return 0;
11394 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011395 x = _Py_HashBytes(PyUnicode_DATA(self),
11396 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011398 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011399}
11400
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011401PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011402 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011404Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405
11406static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011407unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011409 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011410 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011411 Py_ssize_t start;
11412 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413
Jesus Ceaac451502011-04-20 17:09:23 +020011414 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11415 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417
Christian Heimesd47a0452013-06-29 21:21:37 +020011418 if (PyUnicode_READY(self) == -1) {
11419 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011420 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011421 }
11422 if (PyUnicode_READY(substring) == -1) {
11423 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011425 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426
Victor Stinner7931d9a2011-11-04 00:22:48 +010011427 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428
11429 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011430
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431 if (result == -2)
11432 return NULL;
11433
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434 if (result < 0) {
11435 PyErr_SetString(PyExc_ValueError, "substring not found");
11436 return NULL;
11437 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011438
Christian Heimes217cfd12007-12-02 14:31:20 +000011439 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440}
11441
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011442PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011443 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011445Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011446at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447
11448static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011449unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011451 Py_ssize_t i, length;
11452 int kind;
11453 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454 int cased;
11455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011456 if (PyUnicode_READY(self) == -1)
11457 return NULL;
11458 length = PyUnicode_GET_LENGTH(self);
11459 kind = PyUnicode_KIND(self);
11460 data = PyUnicode_DATA(self);
11461
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011463 if (length == 1)
11464 return PyBool_FromLong(
11465 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011467 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011468 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011469 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011470
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011472 for (i = 0; i < length; i++) {
11473 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011474
Benjamin Peterson29060642009-01-31 22:14:21 +000011475 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11476 return PyBool_FromLong(0);
11477 else if (!cased && Py_UNICODE_ISLOWER(ch))
11478 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011480 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481}
11482
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011483PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011484 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011486Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011487at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488
11489static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011490unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011492 Py_ssize_t i, length;
11493 int kind;
11494 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495 int cased;
11496
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011497 if (PyUnicode_READY(self) == -1)
11498 return NULL;
11499 length = PyUnicode_GET_LENGTH(self);
11500 kind = PyUnicode_KIND(self);
11501 data = PyUnicode_DATA(self);
11502
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011504 if (length == 1)
11505 return PyBool_FromLong(
11506 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011508 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011509 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011510 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011511
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 for (i = 0; i < length; i++) {
11514 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011515
Benjamin Peterson29060642009-01-31 22:14:21 +000011516 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11517 return PyBool_FromLong(0);
11518 else if (!cased && Py_UNICODE_ISUPPER(ch))
11519 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011521 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522}
11523
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011524PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011525 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011527Return True if S is a titlecased string and there is at least one\n\
11528character in S, i.e. upper- and titlecase characters may only\n\
11529follow uncased characters and lowercase characters only cased ones.\n\
11530Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531
11532static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011533unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535 Py_ssize_t i, length;
11536 int kind;
11537 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538 int cased, previous_is_cased;
11539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011540 if (PyUnicode_READY(self) == -1)
11541 return NULL;
11542 length = PyUnicode_GET_LENGTH(self);
11543 kind = PyUnicode_KIND(self);
11544 data = PyUnicode_DATA(self);
11545
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 if (length == 1) {
11548 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11549 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11550 (Py_UNICODE_ISUPPER(ch) != 0));
11551 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011553 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011554 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011556
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557 cased = 0;
11558 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011559 for (i = 0; i < length; i++) {
11560 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011561
Benjamin Peterson29060642009-01-31 22:14:21 +000011562 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11563 if (previous_is_cased)
11564 return PyBool_FromLong(0);
11565 previous_is_cased = 1;
11566 cased = 1;
11567 }
11568 else if (Py_UNICODE_ISLOWER(ch)) {
11569 if (!previous_is_cased)
11570 return PyBool_FromLong(0);
11571 previous_is_cased = 1;
11572 cased = 1;
11573 }
11574 else
11575 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011577 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578}
11579
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011580PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011583Return True if all characters in S are whitespace\n\
11584and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585
11586static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011587unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011589 Py_ssize_t i, length;
11590 int kind;
11591 void *data;
11592
11593 if (PyUnicode_READY(self) == -1)
11594 return NULL;
11595 length = PyUnicode_GET_LENGTH(self);
11596 kind = PyUnicode_KIND(self);
11597 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011600 if (length == 1)
11601 return PyBool_FromLong(
11602 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011604 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011606 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011607
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011608 for (i = 0; i < length; i++) {
11609 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011610 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011611 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011613 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614}
11615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011616PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011617 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011618\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011619Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011620and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011621
11622static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011623unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011624{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 Py_ssize_t i, length;
11626 int kind;
11627 void *data;
11628
11629 if (PyUnicode_READY(self) == -1)
11630 return NULL;
11631 length = PyUnicode_GET_LENGTH(self);
11632 kind = PyUnicode_KIND(self);
11633 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011634
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011635 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011636 if (length == 1)
11637 return PyBool_FromLong(
11638 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011639
11640 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011642 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011644 for (i = 0; i < length; i++) {
11645 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011646 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011647 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011648 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011649}
11650
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011651PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011652 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011653\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011654Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011655and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011656
11657static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011658unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011659{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011660 int kind;
11661 void *data;
11662 Py_ssize_t len, i;
11663
11664 if (PyUnicode_READY(self) == -1)
11665 return NULL;
11666
11667 kind = PyUnicode_KIND(self);
11668 data = PyUnicode_DATA(self);
11669 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011670
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011671 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011672 if (len == 1) {
11673 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11674 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11675 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011676
11677 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011679 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011680
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011681 for (i = 0; i < len; i++) {
11682 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011683 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011684 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011685 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011686 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011687}
11688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011689PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011690 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011692Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011693False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694
11695static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011696unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011698 Py_ssize_t i, length;
11699 int kind;
11700 void *data;
11701
11702 if (PyUnicode_READY(self) == -1)
11703 return NULL;
11704 length = PyUnicode_GET_LENGTH(self);
11705 kind = PyUnicode_KIND(self);
11706 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709 if (length == 1)
11710 return PyBool_FromLong(
11711 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011713 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011715 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011717 for (i = 0; i < length; i++) {
11718 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011719 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011721 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722}
11723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011724PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011725 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011727Return True if all characters in S are digits\n\
11728and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729
11730static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011731unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011733 Py_ssize_t i, length;
11734 int kind;
11735 void *data;
11736
11737 if (PyUnicode_READY(self) == -1)
11738 return NULL;
11739 length = PyUnicode_GET_LENGTH(self);
11740 kind = PyUnicode_KIND(self);
11741 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011744 if (length == 1) {
11745 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11746 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11747 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011749 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011750 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011751 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 for (i = 0; i < length; i++) {
11754 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011755 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011757 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758}
11759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011760PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011761 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011763Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011764False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765
11766static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011767unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 Py_ssize_t i, length;
11770 int kind;
11771 void *data;
11772
11773 if (PyUnicode_READY(self) == -1)
11774 return NULL;
11775 length = PyUnicode_GET_LENGTH(self);
11776 kind = PyUnicode_KIND(self);
11777 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011780 if (length == 1)
11781 return PyBool_FromLong(
11782 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011784 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011786 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788 for (i = 0; i < length; i++) {
11789 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011790 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011792 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793}
11794
Martin v. Löwis47383402007-08-15 07:32:56 +000011795int
11796PyUnicode_IsIdentifier(PyObject *self)
11797{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011798 int kind;
11799 void *data;
11800 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011801 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011803 if (PyUnicode_READY(self) == -1) {
11804 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011805 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011806 }
11807
11808 /* Special case for empty strings */
11809 if (PyUnicode_GET_LENGTH(self) == 0)
11810 return 0;
11811 kind = PyUnicode_KIND(self);
11812 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011813
11814 /* PEP 3131 says that the first character must be in
11815 XID_Start and subsequent characters in XID_Continue,
11816 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011817 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011818 letters, digits, underscore). However, given the current
11819 definition of XID_Start and XID_Continue, it is sufficient
11820 to check just for these, except that _ must be allowed
11821 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011822 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011823 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011824 return 0;
11825
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011826 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011827 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011828 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011829 return 1;
11830}
11831
11832PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011833 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011834\n\
11835Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011836to the language definition.\n\
11837\n\
11838Use keyword.iskeyword() to test for reserved identifiers\n\
11839such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011840
11841static PyObject*
11842unicode_isidentifier(PyObject *self)
11843{
11844 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11845}
11846
Georg Brandl559e5d72008-06-11 18:37:52 +000011847PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011848 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011849\n\
11850Return True if all characters in S are considered\n\
11851printable in repr() or S is empty, False otherwise.");
11852
11853static PyObject*
11854unicode_isprintable(PyObject *self)
11855{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011856 Py_ssize_t i, length;
11857 int kind;
11858 void *data;
11859
11860 if (PyUnicode_READY(self) == -1)
11861 return NULL;
11862 length = PyUnicode_GET_LENGTH(self);
11863 kind = PyUnicode_KIND(self);
11864 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011865
11866 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011867 if (length == 1)
11868 return PyBool_FromLong(
11869 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 for (i = 0; i < length; i++) {
11872 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011873 Py_RETURN_FALSE;
11874 }
11875 }
11876 Py_RETURN_TRUE;
11877}
11878
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011879PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011880 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881\n\
11882Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011883iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884
11885static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011886unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011888 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889}
11890
Martin v. Löwis18e16552006-02-15 17:27:45 +000011891static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011892unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011894 if (PyUnicode_READY(self) == -1)
11895 return -1;
11896 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897}
11898
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011899PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011900 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011902Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011903done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904
11905static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011906unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011908 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011909 Py_UCS4 fillchar = ' ';
11910
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011911 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912 return NULL;
11913
Benjamin Petersonbac79492012-01-14 13:34:47 -050011914 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011915 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916
Victor Stinnerc4b49542011-12-11 22:44:26 +010011917 if (PyUnicode_GET_LENGTH(self) >= width)
11918 return unicode_result_unchanged(self);
11919
11920 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921}
11922
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011923PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011924 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011926Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927
11928static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011929unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011931 if (PyUnicode_READY(self) == -1)
11932 return NULL;
11933 if (PyUnicode_IS_ASCII(self))
11934 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011935 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936}
11937
/* Selector values for the strip family; they double as indices into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
11946
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011947/* externally visible for str.strip(unicode) */
11948PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011949_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011950{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 void *data;
11952 int kind;
11953 Py_ssize_t i, j, len;
11954 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011955 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11958 return NULL;
11959
11960 kind = PyUnicode_KIND(self);
11961 data = PyUnicode_DATA(self);
11962 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011963 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11965 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011966 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011967
Benjamin Peterson14339b62009-01-31 16:36:08 +000011968 i = 0;
11969 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011970 while (i < len) {
11971 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11972 if (!BLOOM(sepmask, ch))
11973 break;
11974 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11975 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011976 i++;
11977 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011978 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011979
Benjamin Peterson14339b62009-01-31 16:36:08 +000011980 j = len;
11981 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011982 j--;
11983 while (j >= i) {
11984 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11985 if (!BLOOM(sepmask, ch))
11986 break;
11987 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11988 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011990 }
11991
Benjamin Peterson29060642009-01-31 22:14:21 +000011992 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011993 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011994
Victor Stinner7931d9a2011-11-04 00:22:48 +010011995 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996}
11997
11998PyObject*
11999PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12000{
12001 unsigned char *data;
12002 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012003 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004
Victor Stinnerde636f32011-10-01 03:55:54 +020012005 if (PyUnicode_READY(self) == -1)
12006 return NULL;
12007
Victor Stinner684d5fd2012-05-03 02:32:34 +020012008 length = PyUnicode_GET_LENGTH(self);
12009 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012010
Victor Stinner684d5fd2012-05-03 02:32:34 +020012011 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012012 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013
Victor Stinnerde636f32011-10-01 03:55:54 +020012014 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012015 PyErr_SetString(PyExc_IndexError, "string index out of range");
12016 return NULL;
12017 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012018 if (start >= length || end < start)
12019 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012020
Victor Stinner684d5fd2012-05-03 02:32:34 +020012021 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012022 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012023 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012024 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012025 }
12026 else {
12027 kind = PyUnicode_KIND(self);
12028 data = PyUnicode_1BYTE_DATA(self);
12029 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012030 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012031 length);
12032 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034
12035static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012036do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012037{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038 Py_ssize_t len, i, j;
12039
12040 if (PyUnicode_READY(self) == -1)
12041 return NULL;
12042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012043 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012044
Victor Stinnercc7af722013-04-09 22:39:24 +020012045 if (PyUnicode_IS_ASCII(self)) {
12046 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12047
12048 i = 0;
12049 if (striptype != RIGHTSTRIP) {
12050 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012051 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012052 if (!_Py_ascii_whitespace[ch])
12053 break;
12054 i++;
12055 }
12056 }
12057
12058 j = len;
12059 if (striptype != LEFTSTRIP) {
12060 j--;
12061 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012062 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012063 if (!_Py_ascii_whitespace[ch])
12064 break;
12065 j--;
12066 }
12067 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012068 }
12069 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012070 else {
12071 int kind = PyUnicode_KIND(self);
12072 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012073
Victor Stinnercc7af722013-04-09 22:39:24 +020012074 i = 0;
12075 if (striptype != RIGHTSTRIP) {
12076 while (i < len) {
12077 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12078 if (!Py_UNICODE_ISSPACE(ch))
12079 break;
12080 i++;
12081 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012082 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012083
12084 j = len;
12085 if (striptype != LEFTSTRIP) {
12086 j--;
12087 while (j >= i) {
12088 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12089 if (!Py_UNICODE_ISSPACE(ch))
12090 break;
12091 j--;
12092 }
12093 j++;
12094 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012095 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012096
Victor Stinner7931d9a2011-11-04 00:22:48 +010012097 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098}
12099
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012100
12101static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012102do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012103{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012104 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012105
Serhiy Storchakac6792272013-10-19 21:03:34 +030012106 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012107 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012108
Benjamin Peterson14339b62009-01-31 16:36:08 +000012109 if (sep != NULL && sep != Py_None) {
12110 if (PyUnicode_Check(sep))
12111 return _PyUnicode_XStrip(self, striptype, sep);
12112 else {
12113 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012114 "%s arg must be None or str",
12115 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012116 return NULL;
12117 }
12118 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012119
Benjamin Peterson14339b62009-01-31 16:36:08 +000012120 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012121}
12122
12123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012124PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012125 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012126\n\
12127Return a copy of the string S with leading and trailing\n\
12128whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012129If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012130
12131static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012132unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012133{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012134 if (PyTuple_GET_SIZE(args) == 0)
12135 return do_strip(self, BOTHSTRIP); /* Common case */
12136 else
12137 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012138}
12139
12140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012141PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012142 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012143\n\
12144Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012145If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012146
12147static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012148unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012149{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012150 if (PyTuple_GET_SIZE(args) == 0)
12151 return do_strip(self, LEFTSTRIP); /* Common case */
12152 else
12153 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012154}
12155
12156
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012157PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012158 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012159\n\
12160Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012161If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012162
12163static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012164unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012165{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012166 if (PyTuple_GET_SIZE(args) == 0)
12167 return do_strip(self, RIGHTSTRIP); /* Common case */
12168 else
12169 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012170}
12171
12172
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012174unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012175{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012176 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012177 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178
Serhiy Storchaka05997252013-01-26 12:14:02 +020012179 if (len < 1)
12180 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181
Victor Stinnerc4b49542011-12-11 22:44:26 +010012182 /* no repeat, return original string */
12183 if (len == 1)
12184 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012185
Benjamin Petersonbac79492012-01-14 13:34:47 -050012186 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012187 return NULL;
12188
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012189 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012190 PyErr_SetString(PyExc_OverflowError,
12191 "repeated string is too long");
12192 return NULL;
12193 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012195
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012196 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197 if (!u)
12198 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012199 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012201 if (PyUnicode_GET_LENGTH(str) == 1) {
12202 const int kind = PyUnicode_KIND(str);
12203 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012204 if (kind == PyUnicode_1BYTE_KIND) {
12205 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012206 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012207 }
12208 else if (kind == PyUnicode_2BYTE_KIND) {
12209 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012210 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012211 ucs2[n] = fill_char;
12212 } else {
12213 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12214 assert(kind == PyUnicode_4BYTE_KIND);
12215 for (n = 0; n < len; ++n)
12216 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218 }
12219 else {
12220 /* number of characters copied this far */
12221 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012222 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223 char *to = (char *) PyUnicode_DATA(u);
12224 Py_MEMCPY(to, PyUnicode_DATA(str),
12225 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012226 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012227 n = (done <= nchars-done) ? done : nchars-done;
12228 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012229 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231 }
12232
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012233 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012234 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235}
12236
Alexander Belopolsky40018472011-02-26 01:02:56 +000012237PyObject *
12238PyUnicode_Replace(PyObject *obj,
12239 PyObject *subobj,
12240 PyObject *replobj,
12241 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242{
12243 PyObject *self;
12244 PyObject *str1;
12245 PyObject *str2;
12246 PyObject *result;
12247
12248 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012249 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012250 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012252 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012253 Py_DECREF(self);
12254 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255 }
12256 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012257 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012258 Py_DECREF(self);
12259 Py_DECREF(str1);
12260 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012262 if (PyUnicode_READY(self) == -1 ||
12263 PyUnicode_READY(str1) == -1 ||
12264 PyUnicode_READY(str2) == -1)
12265 result = NULL;
12266 else
12267 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268 Py_DECREF(self);
12269 Py_DECREF(str1);
12270 Py_DECREF(str2);
12271 return result;
12272}
12273
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012274PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012275 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276\n\
12277Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012278old replaced by new. If the optional argument count is\n\
12279given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280
12281static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012282unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012284 PyObject *str1;
12285 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012286 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287 PyObject *result;
12288
Martin v. Löwis18e16552006-02-15 17:27:45 +000012289 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012291 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012292 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012293 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012294 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012295 return NULL;
12296 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012297 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012298 Py_DECREF(str1);
12299 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012300 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012301 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12302 result = NULL;
12303 else
12304 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305
12306 Py_DECREF(str1);
12307 Py_DECREF(str2);
12308 return result;
12309}
12310
Alexander Belopolsky40018472011-02-26 01:02:56 +000012311static PyObject *
12312unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012313{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012314 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012315 Py_ssize_t isize;
12316 Py_ssize_t osize, squote, dquote, i, o;
12317 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012318 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012320
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012321 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012322 return NULL;
12323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 isize = PyUnicode_GET_LENGTH(unicode);
12325 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012326
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012327 /* Compute length of output, quote characters, and
12328 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012329 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012330 max = 127;
12331 squote = dquote = 0;
12332 ikind = PyUnicode_KIND(unicode);
12333 for (i = 0; i < isize; i++) {
12334 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12335 switch (ch) {
12336 case '\'': squote++; osize++; break;
12337 case '"': dquote++; osize++; break;
12338 case '\\': case '\t': case '\r': case '\n':
12339 osize += 2; break;
12340 default:
12341 /* Fast-path ASCII */
12342 if (ch < ' ' || ch == 0x7f)
12343 osize += 4; /* \xHH */
12344 else if (ch < 0x7f)
12345 osize++;
12346 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12347 osize++;
12348 max = ch > max ? ch : max;
12349 }
12350 else if (ch < 0x100)
12351 osize += 4; /* \xHH */
12352 else if (ch < 0x10000)
12353 osize += 6; /* \uHHHH */
12354 else
12355 osize += 10; /* \uHHHHHHHH */
12356 }
12357 }
12358
12359 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012360 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012361 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012362 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 if (dquote)
12364 /* Both squote and dquote present. Use squote,
12365 and escape them */
12366 osize += squote;
12367 else
12368 quote = '"';
12369 }
Victor Stinner55c08782013-04-14 18:45:39 +020012370 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371
12372 repr = PyUnicode_New(osize, max);
12373 if (repr == NULL)
12374 return NULL;
12375 okind = PyUnicode_KIND(repr);
12376 odata = PyUnicode_DATA(repr);
12377
12378 PyUnicode_WRITE(okind, odata, 0, quote);
12379 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012380 if (unchanged) {
12381 _PyUnicode_FastCopyCharacters(repr, 1,
12382 unicode, 0,
12383 isize);
12384 }
12385 else {
12386 for (i = 0, o = 1; i < isize; i++) {
12387 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012388
Victor Stinner55c08782013-04-14 18:45:39 +020012389 /* Escape quotes and backslashes */
12390 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012391 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012392 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012393 continue;
12394 }
12395
12396 /* Map special whitespace to '\t', \n', '\r' */
12397 if (ch == '\t') {
12398 PyUnicode_WRITE(okind, odata, o++, '\\');
12399 PyUnicode_WRITE(okind, odata, o++, 't');
12400 }
12401 else if (ch == '\n') {
12402 PyUnicode_WRITE(okind, odata, o++, '\\');
12403 PyUnicode_WRITE(okind, odata, o++, 'n');
12404 }
12405 else if (ch == '\r') {
12406 PyUnicode_WRITE(okind, odata, o++, '\\');
12407 PyUnicode_WRITE(okind, odata, o++, 'r');
12408 }
12409
12410 /* Map non-printable US ASCII to '\xhh' */
12411 else if (ch < ' ' || ch == 0x7F) {
12412 PyUnicode_WRITE(okind, odata, o++, '\\');
12413 PyUnicode_WRITE(okind, odata, o++, 'x');
12414 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12415 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12416 }
12417
12418 /* Copy ASCII characters as-is */
12419 else if (ch < 0x7F) {
12420 PyUnicode_WRITE(okind, odata, o++, ch);
12421 }
12422
12423 /* Non-ASCII characters */
12424 else {
12425 /* Map Unicode whitespace and control characters
12426 (categories Z* and C* except ASCII space)
12427 */
12428 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12429 PyUnicode_WRITE(okind, odata, o++, '\\');
12430 /* Map 8-bit characters to '\xhh' */
12431 if (ch <= 0xff) {
12432 PyUnicode_WRITE(okind, odata, o++, 'x');
12433 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12434 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12435 }
12436 /* Map 16-bit characters to '\uxxxx' */
12437 else if (ch <= 0xffff) {
12438 PyUnicode_WRITE(okind, odata, o++, 'u');
12439 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12440 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12441 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12442 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12443 }
12444 /* Map 21-bit characters to '\U00xxxxxx' */
12445 else {
12446 PyUnicode_WRITE(okind, odata, o++, 'U');
12447 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12448 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12449 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12450 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12451 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12452 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12453 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12454 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12455 }
12456 }
12457 /* Copy characters as-is */
12458 else {
12459 PyUnicode_WRITE(okind, odata, o++, ch);
12460 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012461 }
12462 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012464 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012465 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012466 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012467}
12468
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012469PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012470 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012471\n\
12472Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012473such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012474arguments start and end are interpreted as in slice notation.\n\
12475\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012476Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012477
12478static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012479unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012480{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012481 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012482 Py_ssize_t start;
12483 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012484 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012485
Jesus Ceaac451502011-04-20 17:09:23 +020012486 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12487 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012488 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012489
Christian Heimesea71a522013-06-29 21:17:34 +020012490 if (PyUnicode_READY(self) == -1) {
12491 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012493 }
12494 if (PyUnicode_READY(substring) == -1) {
12495 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012496 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012497 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498
Victor Stinner7931d9a2011-11-04 00:22:48 +010012499 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012500
12501 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012502
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012503 if (result == -2)
12504 return NULL;
12505
Christian Heimes217cfd12007-12-02 14:31:20 +000012506 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507}
12508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012509PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012510 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012512Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513
12514static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012517 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012518 Py_ssize_t start;
12519 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012520 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521
Jesus Ceaac451502011-04-20 17:09:23 +020012522 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12523 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012524 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525
Christian Heimesea71a522013-06-29 21:17:34 +020012526 if (PyUnicode_READY(self) == -1) {
12527 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012528 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012529 }
12530 if (PyUnicode_READY(substring) == -1) {
12531 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012532 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012533 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012534
Victor Stinner7931d9a2011-11-04 00:22:48 +010012535 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012536
12537 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012539 if (result == -2)
12540 return NULL;
12541
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542 if (result < 0) {
12543 PyErr_SetString(PyExc_ValueError, "substring not found");
12544 return NULL;
12545 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012546
Christian Heimes217cfd12007-12-02 14:31:20 +000012547 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548}
12549
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012550PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012551 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012552\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012553Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012554done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012555
12556static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012557unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012558{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012559 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012560 Py_UCS4 fillchar = ' ';
12561
Victor Stinnere9a29352011-10-01 02:14:59 +020012562 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012563 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012564
Benjamin Petersonbac79492012-01-14 13:34:47 -050012565 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566 return NULL;
12567
Victor Stinnerc4b49542011-12-11 22:44:26 +010012568 if (PyUnicode_GET_LENGTH(self) >= width)
12569 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570
Victor Stinnerc4b49542011-12-11 22:44:26 +010012571 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572}
12573
Alexander Belopolsky40018472011-02-26 01:02:56 +000012574PyObject *
12575PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012576{
12577 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012578
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579 s = PyUnicode_FromObject(s);
12580 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012581 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012582 if (sep != NULL) {
12583 sep = PyUnicode_FromObject(sep);
12584 if (sep == NULL) {
12585 Py_DECREF(s);
12586 return NULL;
12587 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012588 }
12589
Victor Stinner9310abb2011-10-05 00:59:23 +020012590 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012591
12592 Py_DECREF(s);
12593 Py_XDECREF(sep);
12594 return result;
12595}
12596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012597PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012598 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012599\n\
12600Return a list of the words in S, using sep as the\n\
12601delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012602splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012603whitespace string is a separator and empty strings are\n\
12604removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012605
12606static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012607unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012608{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012609 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012611 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012612
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012613 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12614 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012615 return NULL;
12616
12617 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012618 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012619 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012620 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012621 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012622 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623}
12624
Thomas Wouters477c8d52006-05-27 19:21:47 +000012625PyObject *
12626PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12627{
12628 PyObject* str_obj;
12629 PyObject* sep_obj;
12630 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012631 int kind1, kind2, kind;
12632 void *buf1 = NULL, *buf2 = NULL;
12633 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012634
12635 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012636 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012637 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012638 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012639 if (!sep_obj) {
12640 Py_DECREF(str_obj);
12641 return NULL;
12642 }
12643 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12644 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012645 Py_DECREF(str_obj);
12646 return NULL;
12647 }
12648
Victor Stinner14f8f022011-10-05 20:58:25 +020012649 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012651 kind = Py_MAX(kind1, kind2);
12652 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012654 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655 if (!buf1)
12656 goto onError;
12657 buf2 = PyUnicode_DATA(sep_obj);
12658 if (kind2 != kind)
12659 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12660 if (!buf2)
12661 goto onError;
12662 len1 = PyUnicode_GET_LENGTH(str_obj);
12663 len2 = PyUnicode_GET_LENGTH(sep_obj);
12664
Benjamin Petersonead6b532011-12-20 17:23:42 -060012665 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012667 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12668 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12669 else
12670 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012671 break;
12672 case PyUnicode_2BYTE_KIND:
12673 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12674 break;
12675 case PyUnicode_4BYTE_KIND:
12676 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12677 break;
12678 default:
12679 assert(0);
12680 out = 0;
12681 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012682
12683 Py_DECREF(sep_obj);
12684 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012685 if (kind1 != kind)
12686 PyMem_Free(buf1);
12687 if (kind2 != kind)
12688 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012689
12690 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012691 onError:
12692 Py_DECREF(sep_obj);
12693 Py_DECREF(str_obj);
12694 if (kind1 != kind && buf1)
12695 PyMem_Free(buf1);
12696 if (kind2 != kind && buf2)
12697 PyMem_Free(buf2);
12698 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012699}
12700
12701
12702PyObject *
12703PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12704{
12705 PyObject* str_obj;
12706 PyObject* sep_obj;
12707 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012708 int kind1, kind2, kind;
12709 void *buf1 = NULL, *buf2 = NULL;
12710 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012711
12712 str_obj = PyUnicode_FromObject(str_in);
12713 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012714 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012715 sep_obj = PyUnicode_FromObject(sep_in);
12716 if (!sep_obj) {
12717 Py_DECREF(str_obj);
12718 return NULL;
12719 }
12720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012721 kind1 = PyUnicode_KIND(str_in);
12722 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012723 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724 buf1 = PyUnicode_DATA(str_in);
12725 if (kind1 != kind)
12726 buf1 = _PyUnicode_AsKind(str_in, kind);
12727 if (!buf1)
12728 goto onError;
12729 buf2 = PyUnicode_DATA(sep_obj);
12730 if (kind2 != kind)
12731 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12732 if (!buf2)
12733 goto onError;
12734 len1 = PyUnicode_GET_LENGTH(str_obj);
12735 len2 = PyUnicode_GET_LENGTH(sep_obj);
12736
Benjamin Petersonead6b532011-12-20 17:23:42 -060012737 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012738 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012739 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12740 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12741 else
12742 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012743 break;
12744 case PyUnicode_2BYTE_KIND:
12745 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12746 break;
12747 case PyUnicode_4BYTE_KIND:
12748 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12749 break;
12750 default:
12751 assert(0);
12752 out = 0;
12753 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012754
12755 Py_DECREF(sep_obj);
12756 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012757 if (kind1 != kind)
12758 PyMem_Free(buf1);
12759 if (kind2 != kind)
12760 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012761
12762 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012763 onError:
12764 Py_DECREF(sep_obj);
12765 Py_DECREF(str_obj);
12766 if (kind1 != kind && buf1)
12767 PyMem_Free(buf1);
12768 if (kind2 != kind && buf2)
12769 PyMem_Free(buf2);
12770 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012771}
12772
12773PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012774 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012775\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012776Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012777the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012778found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012779
12780static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012781unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012782{
Victor Stinner9310abb2011-10-05 00:59:23 +020012783 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012784}
12785
12786PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012787 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012788\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012789Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012790the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012791separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012792
12793static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012794unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012795{
Victor Stinner9310abb2011-10-05 00:59:23 +020012796 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012797}
12798
Alexander Belopolsky40018472011-02-26 01:02:56 +000012799PyObject *
12800PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012801{
12802 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012803
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012804 s = PyUnicode_FromObject(s);
12805 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012806 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012807 if (sep != NULL) {
12808 sep = PyUnicode_FromObject(sep);
12809 if (sep == NULL) {
12810 Py_DECREF(s);
12811 return NULL;
12812 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012813 }
12814
Victor Stinner9310abb2011-10-05 00:59:23 +020012815 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012816
12817 Py_DECREF(s);
12818 Py_XDECREF(sep);
12819 return result;
12820}
12821
12822PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012823 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012824\n\
12825Return a list of the words in S, using sep as the\n\
12826delimiter string, starting at the end of the string and\n\
12827working to the front. If maxsplit is given, at most maxsplit\n\
12828splits are done. If sep is not specified, any whitespace string\n\
12829is a separator.");
12830
12831static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012832unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012833{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012834 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012835 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012836 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012837
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012838 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12839 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012840 return NULL;
12841
12842 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012843 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012844 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012845 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012846 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012847 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012848}
12849
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012850PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012851 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012852\n\
12853Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012854Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012855is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012856
12857static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012858unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012859{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012860 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012861 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012862
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012863 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12864 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012865 return NULL;
12866
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012867 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868}
12869
12870static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012871PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012872{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012873 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874}
12875
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012876PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012877 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012878\n\
12879Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012880and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012881
12882static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012883unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012884{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012885 if (PyUnicode_READY(self) == -1)
12886 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012887 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012888}
12889
Larry Hastings61272b72014-01-07 12:41:53 -080012890/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012891
Larry Hastings31826802013-10-19 00:09:25 -070012892@staticmethod
12893str.maketrans as unicode_maketrans
12894
12895 x: object
12896
12897 y: unicode=NULL
12898
12899 z: unicode=NULL
12900
12901 /
12902
12903Return a translation table usable for str.translate().
12904
12905If there is only one argument, it must be a dictionary mapping Unicode
12906ordinals (integers) or characters to Unicode ordinals, strings or None.
12907Character keys will be then converted to ordinals.
12908If there are two arguments, they must be strings of equal length, and
12909in the resulting dictionary, each character in x will be mapped to the
12910character at the same position in y. If there is a third argument, it
12911must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012912[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012913
12914PyDoc_STRVAR(unicode_maketrans__doc__,
Larry Hastings2623c8c2014-02-08 22:15:29 -080012915"maketrans(x, y=None, z=None, /)\n"
12916"--\n"
12917"\n"
Larry Hastings31826802013-10-19 00:09:25 -070012918"Return a translation table usable for str.translate().\n"
12919"\n"
Larry Hastings31826802013-10-19 00:09:25 -070012920"If there is only one argument, it must be a dictionary mapping Unicode\n"
12921"ordinals (integers) or characters to Unicode ordinals, strings or None.\n"
12922"Character keys will be then converted to ordinals.\n"
12923"If there are two arguments, they must be strings of equal length, and\n"
12924"in the resulting dictionary, each character in x will be mapped to the\n"
12925"character at the same position in y. If there is a third argument, it\n"
12926"must be a string, whose characters will be mapped to None in the result.");
12927
12928#define UNICODE_MAKETRANS_METHODDEF \
12929 {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__},
12930
12931static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012932unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z);
Larry Hastings31826802013-10-19 00:09:25 -070012933
12934static PyObject *
Larry Hastingsebdcb502013-11-23 14:54:00 -080012935unicode_maketrans(void *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012936{
Larry Hastings31826802013-10-19 00:09:25 -070012937 PyObject *return_value = NULL;
12938 PyObject *x;
12939 PyObject *y = NULL;
12940 PyObject *z = NULL;
12941
12942 if (!PyArg_ParseTuple(args,
12943 "O|UU:maketrans",
12944 &x, &y, &z))
12945 goto exit;
Larry Hastings5c661892014-01-24 06:17:25 -080012946 return_value = unicode_maketrans_impl(x, y, z);
Larry Hastings31826802013-10-19 00:09:25 -070012947
12948exit:
12949 return return_value;
12950}
12951
12952static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012953unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Larry Hastings2623c8c2014-02-08 22:15:29 -080012954/*[clinic end generated code: output=566edf630f77436a input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012955{
Georg Brandlceee0772007-11-27 23:48:05 +000012956 PyObject *new = NULL, *key, *value;
12957 Py_ssize_t i = 0;
12958 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012959
Georg Brandlceee0772007-11-27 23:48:05 +000012960 new = PyDict_New();
12961 if (!new)
12962 return NULL;
12963 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012964 int x_kind, y_kind, z_kind;
12965 void *x_data, *y_data, *z_data;
12966
Georg Brandlceee0772007-11-27 23:48:05 +000012967 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012968 if (!PyUnicode_Check(x)) {
12969 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12970 "be a string if there is a second argument");
12971 goto err;
12972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012973 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012974 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12975 "arguments must have equal length");
12976 goto err;
12977 }
12978 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012979 x_kind = PyUnicode_KIND(x);
12980 y_kind = PyUnicode_KIND(y);
12981 x_data = PyUnicode_DATA(x);
12982 y_data = PyUnicode_DATA(y);
12983 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12984 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012985 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012986 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012987 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012988 if (!value) {
12989 Py_DECREF(key);
12990 goto err;
12991 }
Georg Brandlceee0772007-11-27 23:48:05 +000012992 res = PyDict_SetItem(new, key, value);
12993 Py_DECREF(key);
12994 Py_DECREF(value);
12995 if (res < 0)
12996 goto err;
12997 }
12998 /* create entries for deleting chars in z */
12999 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013000 z_kind = PyUnicode_KIND(z);
13001 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013002 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013003 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013004 if (!key)
13005 goto err;
13006 res = PyDict_SetItem(new, key, Py_None);
13007 Py_DECREF(key);
13008 if (res < 0)
13009 goto err;
13010 }
13011 }
13012 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013013 int kind;
13014 void *data;
13015
Georg Brandlceee0772007-11-27 23:48:05 +000013016 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013017 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013018 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13019 "to maketrans it must be a dict");
13020 goto err;
13021 }
13022 /* copy entries into the new dict, converting string keys to int keys */
13023 while (PyDict_Next(x, &i, &key, &value)) {
13024 if (PyUnicode_Check(key)) {
13025 /* convert string keys to integer keys */
13026 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013027 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013028 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13029 "table must be of length 1");
13030 goto err;
13031 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013032 kind = PyUnicode_KIND(key);
13033 data = PyUnicode_DATA(key);
13034 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013035 if (!newkey)
13036 goto err;
13037 res = PyDict_SetItem(new, newkey, value);
13038 Py_DECREF(newkey);
13039 if (res < 0)
13040 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013041 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013042 /* just keep integer keys */
13043 if (PyDict_SetItem(new, key, value) < 0)
13044 goto err;
13045 } else {
13046 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13047 "be strings or integers");
13048 goto err;
13049 }
13050 }
13051 }
13052 return new;
13053 err:
13054 Py_DECREF(new);
13055 return NULL;
13056}
13057
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013058PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013059 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013060\n\
13061Return a copy of the string S, where all characters have been mapped\n\
13062through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000013063Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000013064Unmapped characters are left untouched. Characters mapped to None\n\
13065are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013066
13067static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013068unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013069{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013070 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071}
13072
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013073PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013074 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013075\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013076Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013077
13078static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013079unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013080{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013081 if (PyUnicode_READY(self) == -1)
13082 return NULL;
13083 if (PyUnicode_IS_ASCII(self))
13084 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013085 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086}
13087
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013088PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013089 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013090\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013091Pad a numeric string S with zeros on the left, to fill a field\n\
13092of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013093
13094static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013095unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013096{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013097 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013098 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013099 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013100 int kind;
13101 void *data;
13102 Py_UCS4 chr;
13103
Martin v. Löwis18e16552006-02-15 17:27:45 +000013104 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105 return NULL;
13106
Benjamin Petersonbac79492012-01-14 13:34:47 -050013107 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013108 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109
Victor Stinnerc4b49542011-12-11 22:44:26 +010013110 if (PyUnicode_GET_LENGTH(self) >= width)
13111 return unicode_result_unchanged(self);
13112
13113 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114
13115 u = pad(self, fill, 0, '0');
13116
Walter Dörwald068325e2002-04-15 13:36:47 +000013117 if (u == NULL)
13118 return NULL;
13119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013120 kind = PyUnicode_KIND(u);
13121 data = PyUnicode_DATA(u);
13122 chr = PyUnicode_READ(kind, data, fill);
13123
13124 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013126 PyUnicode_WRITE(kind, data, 0, chr);
13127 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128 }
13129
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013130 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013131 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133
#if 0
/* Compiled out.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13141
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013142PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013143 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013144\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013145Return True if S starts with the specified prefix, False otherwise.\n\
13146With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013147With optional end, stop comparing S at that position.\n\
13148prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013149
13150static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013151unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013152 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013153{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013154 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013155 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013156 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013157 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013158 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013159
Jesus Ceaac451502011-04-20 17:09:23 +020013160 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013161 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013162 if (PyTuple_Check(subobj)) {
13163 Py_ssize_t i;
13164 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013165 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013166 if (substring == NULL)
13167 return NULL;
13168 result = tailmatch(self, substring, start, end, -1);
13169 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013170 if (result == -1)
13171 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013172 if (result) {
13173 Py_RETURN_TRUE;
13174 }
13175 }
13176 /* nothing matched */
13177 Py_RETURN_FALSE;
13178 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013179 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013180 if (substring == NULL) {
13181 if (PyErr_ExceptionMatches(PyExc_TypeError))
13182 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13183 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013184 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013185 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013186 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013187 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013188 if (result == -1)
13189 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013190 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013191}
13192
13193
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013194PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013195 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013196\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013197Return True if S ends with the specified suffix, False otherwise.\n\
13198With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013199With optional end, stop comparing S at that position.\n\
13200suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013201
13202static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013203unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013204 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013205{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013206 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013207 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013208 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013209 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013210 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013211
Jesus Ceaac451502011-04-20 17:09:23 +020013212 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013213 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013214 if (PyTuple_Check(subobj)) {
13215 Py_ssize_t i;
13216 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013217 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013218 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013219 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013220 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013221 result = tailmatch(self, substring, start, end, +1);
13222 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013223 if (result == -1)
13224 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013225 if (result) {
13226 Py_RETURN_TRUE;
13227 }
13228 }
13229 Py_RETURN_FALSE;
13230 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013231 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013232 if (substring == NULL) {
13233 if (PyErr_ExceptionMatches(PyExc_TypeError))
13234 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13235 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013236 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013237 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013238 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013239 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013240 if (result == -1)
13241 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013242 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013243}
13244
Victor Stinner202fdca2012-05-07 12:47:02 +020013245Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013246_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013247{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013248 if (!writer->readonly)
13249 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13250 else {
13251 /* Copy-on-write mode: set buffer size to 0 so
13252 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13253 * next write. */
13254 writer->size = 0;
13255 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013256 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13257 writer->data = PyUnicode_DATA(writer->buffer);
13258 writer->kind = PyUnicode_KIND(writer->buffer);
13259}
13260
Victor Stinnerd3f08822012-05-29 12:57:52 +020013261void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013262_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013263{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013264 memset(writer, 0, sizeof(*writer));
13265#ifdef Py_DEBUG
13266 writer->kind = 5; /* invalid kind */
13267#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013268 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013269}
13270
Victor Stinnerd3f08822012-05-29 12:57:52 +020013271int
13272_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13273 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013274{
Victor Stinner6989ba02013-11-18 21:08:39 +010013275#ifdef MS_WINDOWS
13276 /* On Windows, overallocate by 50% is the best factor */
13277# define OVERALLOCATE_FACTOR 2
13278#else
13279 /* On Linux, overallocate by 25% is the best factor */
13280# define OVERALLOCATE_FACTOR 4
13281#endif
Victor Stinner202fdca2012-05-07 12:47:02 +020013282 Py_ssize_t newlen;
13283 PyObject *newbuffer;
13284
Victor Stinnerd3f08822012-05-29 12:57:52 +020013285 assert(length > 0);
13286
Victor Stinner202fdca2012-05-07 12:47:02 +020013287 if (length > PY_SSIZE_T_MAX - writer->pos) {
13288 PyErr_NoMemory();
13289 return -1;
13290 }
13291 newlen = writer->pos + length;
13292
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013293 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013294
Victor Stinnerd3f08822012-05-29 12:57:52 +020013295 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013296 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013297 if (writer->overallocate
13298 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13299 /* overallocate to limit the number of realloc() */
13300 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013301 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013302 if (newlen < writer->min_length)
13303 newlen = writer->min_length;
13304
Victor Stinnerd3f08822012-05-29 12:57:52 +020013305 writer->buffer = PyUnicode_New(newlen, maxchar);
13306 if (writer->buffer == NULL)
13307 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013308 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013309 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013310 if (writer->overallocate
13311 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13312 /* overallocate to limit the number of realloc() */
13313 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013314 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013315 if (newlen < writer->min_length)
13316 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013317
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013318 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013319 /* resize + widen */
13320 newbuffer = PyUnicode_New(newlen, maxchar);
13321 if (newbuffer == NULL)
13322 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013323 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13324 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013325 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013326 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013327 }
13328 else {
13329 newbuffer = resize_compact(writer->buffer, newlen);
13330 if (newbuffer == NULL)
13331 return -1;
13332 }
13333 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013334 }
13335 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013336 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013337 newbuffer = PyUnicode_New(writer->size, maxchar);
13338 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013339 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013340 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13341 writer->buffer, 0, writer->pos);
13342 Py_DECREF(writer->buffer);
13343 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013344 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013345 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013346 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013347
13348#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013349}
13350
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013351Py_LOCAL_INLINE(int)
13352_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013353{
13354 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13355 return -1;
13356 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13357 writer->pos++;
13358 return 0;
13359}
13360
13361int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013362_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13363{
13364 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13365}
13366
13367int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013368_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13369{
13370 Py_UCS4 maxchar;
13371 Py_ssize_t len;
13372
13373 if (PyUnicode_READY(str) == -1)
13374 return -1;
13375 len = PyUnicode_GET_LENGTH(str);
13376 if (len == 0)
13377 return 0;
13378 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13379 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013380 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013381 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013382 Py_INCREF(str);
13383 writer->buffer = str;
13384 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013385 writer->pos += len;
13386 return 0;
13387 }
13388 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13389 return -1;
13390 }
13391 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13392 str, 0, len);
13393 writer->pos += len;
13394 return 0;
13395}
13396
Victor Stinnere215d962012-10-06 23:03:36 +020013397int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013398_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13399 Py_ssize_t start, Py_ssize_t end)
13400{
13401 Py_UCS4 maxchar;
13402 Py_ssize_t len;
13403
13404 if (PyUnicode_READY(str) == -1)
13405 return -1;
13406
13407 assert(0 <= start);
13408 assert(end <= PyUnicode_GET_LENGTH(str));
13409 assert(start <= end);
13410
13411 if (end == 0)
13412 return 0;
13413
13414 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13415 return _PyUnicodeWriter_WriteStr(writer, str);
13416
13417 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13418 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13419 else
13420 maxchar = writer->maxchar;
13421 len = end - start;
13422
13423 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13424 return -1;
13425
13426 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13427 str, start, len);
13428 writer->pos += len;
13429 return 0;
13430}
13431
13432int
Victor Stinner4a587072013-11-19 12:54:53 +010013433_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13434 const char *ascii, Py_ssize_t len)
13435{
13436 if (len == -1)
13437 len = strlen(ascii);
13438
13439 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13440
13441 if (writer->buffer == NULL && !writer->overallocate) {
13442 PyObject *str;
13443
13444 str = _PyUnicode_FromASCII(ascii, len);
13445 if (str == NULL)
13446 return -1;
13447
13448 writer->readonly = 1;
13449 writer->buffer = str;
13450 _PyUnicodeWriter_Update(writer);
13451 writer->pos += len;
13452 return 0;
13453 }
13454
13455 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13456 return -1;
13457
13458 switch (writer->kind)
13459 {
13460 case PyUnicode_1BYTE_KIND:
13461 {
13462 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13463 Py_UCS1 *data = writer->data;
13464
13465 Py_MEMCPY(data + writer->pos, str, len);
13466 break;
13467 }
13468 case PyUnicode_2BYTE_KIND:
13469 {
13470 _PyUnicode_CONVERT_BYTES(
13471 Py_UCS1, Py_UCS2,
13472 ascii, ascii + len,
13473 (Py_UCS2 *)writer->data + writer->pos);
13474 break;
13475 }
13476 case PyUnicode_4BYTE_KIND:
13477 {
13478 _PyUnicode_CONVERT_BYTES(
13479 Py_UCS1, Py_UCS4,
13480 ascii, ascii + len,
13481 (Py_UCS4 *)writer->data + writer->pos);
13482 break;
13483 }
13484 default:
13485 assert(0);
13486 }
13487
13488 writer->pos += len;
13489 return 0;
13490}
13491
13492int
13493_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13494 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013495{
13496 Py_UCS4 maxchar;
13497
13498 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13499 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13500 return -1;
13501 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13502 writer->pos += len;
13503 return 0;
13504}
13505
Victor Stinnerd3f08822012-05-29 12:57:52 +020013506PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013507_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013508{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013509 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013510 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013511 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013512 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013513 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013514 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013515 str = writer->buffer;
13516 writer->buffer = NULL;
13517 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13518 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013519 }
13520 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13521 PyObject *newbuffer;
13522 newbuffer = resize_compact(writer->buffer, writer->pos);
13523 if (newbuffer == NULL) {
Serhiy Storchakadfe98a12014-02-09 13:46:20 +020013524 Py_CLEAR(writer->buffer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013525 return NULL;
13526 }
13527 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013528 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013529 str = writer->buffer;
13530 writer->buffer = NULL;
13531 assert(_PyUnicode_CheckConsistency(str, 1));
13532 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013533}
13534
Victor Stinnerd3f08822012-05-29 12:57:52 +020013535void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013536_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013537{
13538 Py_CLEAR(writer->buffer);
13539}
13540
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013541#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013542
13543PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013544 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013545\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013546Return a formatted version of S, using substitutions from args and kwargs.\n\
13547The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013548
Eric Smith27bbca62010-11-04 17:06:58 +000013549PyDoc_STRVAR(format_map__doc__,
13550 "S.format_map(mapping) -> str\n\
13551\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013552Return a formatted version of S, using substitutions from mapping.\n\
13553The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013554
Eric Smith4a7d76d2008-05-30 18:10:19 +000013555static PyObject *
13556unicode__format__(PyObject* self, PyObject* args)
13557{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013558 PyObject *format_spec;
13559 _PyUnicodeWriter writer;
13560 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013561
13562 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13563 return NULL;
13564
Victor Stinnerd3f08822012-05-29 12:57:52 +020013565 if (PyUnicode_READY(self) == -1)
13566 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013567 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013568 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13569 self, format_spec, 0,
13570 PyUnicode_GET_LENGTH(format_spec));
13571 if (ret == -1) {
13572 _PyUnicodeWriter_Dealloc(&writer);
13573 return NULL;
13574 }
13575 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013576}
13577
Eric Smith8c663262007-08-25 02:26:07 +000013578PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013579 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013580\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013581Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013582
13583static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013584unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013585{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013586 Py_ssize_t size;
13587
13588 /* If it's a compact object, account for base structure +
13589 character data. */
13590 if (PyUnicode_IS_COMPACT_ASCII(v))
13591 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13592 else if (PyUnicode_IS_COMPACT(v))
13593 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013594 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013595 else {
13596 /* If it is a two-block object, account for base object, and
13597 for character block if present. */
13598 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013599 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013600 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013601 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013602 }
13603 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013604 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013605 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013606 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013607 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013608 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013609
13610 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013611}
13612
13613PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013614 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013615
13616static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013617unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013618{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013619 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013620 if (!copy)
13621 return NULL;
13622 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013623}
13624
Guido van Rossumd57fd912000-03-10 22:53:23 +000013625static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013626 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013627 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013628 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13629 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013630 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13631 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013632 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013633 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13634 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13635 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013636 {"expandtabs", (PyCFunction) unicode_expandtabs,
13637 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013638 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013639 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013640 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13641 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13642 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013643 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013644 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13645 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13646 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013647 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013648 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013649 {"splitlines", (PyCFunction) unicode_splitlines,
13650 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013651 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013652 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13653 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13654 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13655 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13656 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13657 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13658 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13659 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13660 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13661 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13662 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13663 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13664 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13665 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013666 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013667 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013668 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013669 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013670 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013671 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013672 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013673 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013674#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013675 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013676 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013677#endif
13678
Benjamin Peterson14339b62009-01-31 16:36:08 +000013679 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013680 {NULL, NULL}
13681};
13682
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013683static PyObject *
13684unicode_mod(PyObject *v, PyObject *w)
13685{
Brian Curtindfc80e32011-08-10 20:28:54 -050013686 if (!PyUnicode_Check(v))
13687 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013688 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013689}
13690
13691static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013692 0, /*nb_add*/
13693 0, /*nb_subtract*/
13694 0, /*nb_multiply*/
13695 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013696};
13697
Guido van Rossumd57fd912000-03-10 22:53:23 +000013698static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013699 (lenfunc) unicode_length, /* sq_length */
13700 PyUnicode_Concat, /* sq_concat */
13701 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13702 (ssizeargfunc) unicode_getitem, /* sq_item */
13703 0, /* sq_slice */
13704 0, /* sq_ass_item */
13705 0, /* sq_ass_slice */
13706 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013707};
13708
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013709static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013710unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013711{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013712 if (PyUnicode_READY(self) == -1)
13713 return NULL;
13714
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013715 if (PyIndex_Check(item)) {
13716 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013717 if (i == -1 && PyErr_Occurred())
13718 return NULL;
13719 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013720 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013721 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013722 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013723 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013724 PyObject *result;
13725 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013726 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013727 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013729 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013730 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013731 return NULL;
13732 }
13733
13734 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013735 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013736 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013737 slicelength == PyUnicode_GET_LENGTH(self)) {
13738 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013739 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013740 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013741 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013742 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013743 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013744 src_kind = PyUnicode_KIND(self);
13745 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013746 if (!PyUnicode_IS_ASCII(self)) {
13747 kind_limit = kind_maxchar_limit(src_kind);
13748 max_char = 0;
13749 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13750 ch = PyUnicode_READ(src_kind, src_data, cur);
13751 if (ch > max_char) {
13752 max_char = ch;
13753 if (max_char >= kind_limit)
13754 break;
13755 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013756 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013757 }
Victor Stinner55c99112011-10-13 01:17:06 +020013758 else
13759 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013760 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013761 if (result == NULL)
13762 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013763 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013764 dest_data = PyUnicode_DATA(result);
13765
13766 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013767 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13768 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013769 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013770 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013771 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013772 } else {
13773 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13774 return NULL;
13775 }
13776}
13777
13778static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013779 (lenfunc)unicode_length, /* mp_length */
13780 (binaryfunc)unicode_subscript, /* mp_subscript */
13781 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013782};
13783
Guido van Rossumd57fd912000-03-10 22:53:23 +000013784
Guido van Rossumd57fd912000-03-10 22:53:23 +000013785/* Helpers for PyUnicode_Format() */
13786
Victor Stinnera47082312012-10-04 02:19:54 +020013787struct unicode_formatter_t {
13788 PyObject *args;
13789 int args_owned;
13790 Py_ssize_t arglen, argidx;
13791 PyObject *dict;
13792
13793 enum PyUnicode_Kind fmtkind;
13794 Py_ssize_t fmtcnt, fmtpos;
13795 void *fmtdata;
13796 PyObject *fmtstr;
13797
13798 _PyUnicodeWriter writer;
13799};
13800
13801struct unicode_format_arg_t {
13802 Py_UCS4 ch;
13803 int flags;
13804 Py_ssize_t width;
13805 int prec;
13806 int sign;
13807};
13808
Guido van Rossumd57fd912000-03-10 22:53:23 +000013809static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013810unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013811{
Victor Stinnera47082312012-10-04 02:19:54 +020013812 Py_ssize_t argidx = ctx->argidx;
13813
13814 if (argidx < ctx->arglen) {
13815 ctx->argidx++;
13816 if (ctx->arglen < 0)
13817 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013818 else
Victor Stinnera47082312012-10-04 02:19:54 +020013819 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013820 }
13821 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013822 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013823 return NULL;
13824}
13825
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013826/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013827
Victor Stinnera47082312012-10-04 02:19:54 +020013828/* Format a float into the writer if the writer is not NULL, or into *p_output
13829 otherwise.
13830
13831 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013832static int
Victor Stinnera47082312012-10-04 02:19:54 +020013833formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13834 PyObject **p_output,
13835 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013836{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013837 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013838 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013839 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013840 int prec;
13841 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013842
Guido van Rossumd57fd912000-03-10 22:53:23 +000013843 x = PyFloat_AsDouble(v);
13844 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013845 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013846
Victor Stinnera47082312012-10-04 02:19:54 +020013847 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013848 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013849 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013850
Victor Stinnera47082312012-10-04 02:19:54 +020013851 if (arg->flags & F_ALT)
13852 dtoa_flags = Py_DTSF_ALT;
13853 else
13854 dtoa_flags = 0;
13855 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013856 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013857 return -1;
13858 len = strlen(p);
13859 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013860 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013861 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013862 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013863 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013864 }
13865 else
13866 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013867 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013868 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013869}
13870
Victor Stinnerd0880d52012-04-27 23:40:13 +020013871/* formatlong() emulates the format codes d, u, o, x and X, and
13872 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13873 * Python's regular ints.
13874 * Return value: a new PyUnicodeObject*, or NULL if error.
13875 * The output string is of the form
13876 * "-"? ("0x" | "0X")? digit+
13877 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13878 * set in flags. The case of hex digits will be correct,
13879 * There will be at least prec digits, zero-filled on the left if
13880 * necessary to get that many.
13881 * val object to be converted
13882 * flags bitmask of format flags; only F_ALT is looked at
13883 * prec minimum number of digits; 0-fill on left if needed
13884 * type a character in [duoxX]; u acts the same as d
13885 *
13886 * CAUTION: o, x and X conversions on regular ints can never
13887 * produce a '-' sign, but can for Python's unbounded ints.
13888 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013889static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013890formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013891{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013892 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013893 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013894 Py_ssize_t i;
13895 int sign; /* 1 if '-', else 0 */
13896 int len; /* number of characters */
13897 Py_ssize_t llen;
13898 int numdigits; /* len == numnondigits + numdigits */
13899 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013900 int prec = arg->prec;
13901 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013902
Victor Stinnerd0880d52012-04-27 23:40:13 +020013903 /* Avoid exceeding SSIZE_T_MAX */
13904 if (prec > INT_MAX-3) {
13905 PyErr_SetString(PyExc_OverflowError,
13906 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013907 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013908 }
13909
13910 assert(PyLong_Check(val));
13911
13912 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013913 default:
13914 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013915 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013916 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013917 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013918 /* int and int subclasses should print numerically when a numeric */
13919 /* format code is used (see issue18780) */
13920 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013921 break;
13922 case 'o':
13923 numnondigits = 2;
13924 result = PyNumber_ToBase(val, 8);
13925 break;
13926 case 'x':
13927 case 'X':
13928 numnondigits = 2;
13929 result = PyNumber_ToBase(val, 16);
13930 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013931 }
13932 if (!result)
13933 return NULL;
13934
13935 assert(unicode_modifiable(result));
13936 assert(PyUnicode_IS_READY(result));
13937 assert(PyUnicode_IS_ASCII(result));
13938
13939 /* To modify the string in-place, there can only be one reference. */
13940 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013941 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013942 PyErr_BadInternalCall();
13943 return NULL;
13944 }
13945 buf = PyUnicode_DATA(result);
13946 llen = PyUnicode_GET_LENGTH(result);
13947 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013948 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013949 PyErr_SetString(PyExc_ValueError,
13950 "string too large in _PyBytes_FormatLong");
13951 return NULL;
13952 }
13953 len = (int)llen;
13954 sign = buf[0] == '-';
13955 numnondigits += sign;
13956 numdigits = len - numnondigits;
13957 assert(numdigits > 0);
13958
13959 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013960 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013961 (type == 'o' || type == 'x' || type == 'X'))) {
13962 assert(buf[sign] == '0');
13963 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13964 buf[sign+1] == 'o');
13965 numnondigits -= 2;
13966 buf += 2;
13967 len -= 2;
13968 if (sign)
13969 buf[0] = '-';
13970 assert(len == numnondigits + numdigits);
13971 assert(numdigits > 0);
13972 }
13973
13974 /* Fill with leading zeroes to meet minimum width. */
13975 if (prec > numdigits) {
13976 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13977 numnondigits + prec);
13978 char *b1;
13979 if (!r1) {
13980 Py_DECREF(result);
13981 return NULL;
13982 }
13983 b1 = PyBytes_AS_STRING(r1);
13984 for (i = 0; i < numnondigits; ++i)
13985 *b1++ = *buf++;
13986 for (i = 0; i < prec - numdigits; i++)
13987 *b1++ = '0';
13988 for (i = 0; i < numdigits; i++)
13989 *b1++ = *buf++;
13990 *b1 = '\0';
13991 Py_DECREF(result);
13992 result = r1;
13993 buf = PyBytes_AS_STRING(result);
13994 len = numnondigits + prec;
13995 }
13996
13997 /* Fix up case for hex conversions. */
13998 if (type == 'X') {
13999 /* Need to convert all lower case letters to upper case.
14000 and need to convert 0x to 0X (and -0x to -0X). */
14001 for (i = 0; i < len; i++)
14002 if (buf[i] >= 'a' && buf[i] <= 'x')
14003 buf[i] -= 'a'-'A';
14004 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014005 if (!PyUnicode_Check(result)
14006 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014007 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014008 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014009 Py_DECREF(result);
14010 result = unicode;
14011 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014012 else if (len != PyUnicode_GET_LENGTH(result)) {
14013 if (PyUnicode_Resize(&result, len) < 0)
14014 Py_CLEAR(result);
14015 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014016 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014017}
14018
Ethan Furmandf3ed242014-01-05 06:50:30 -080014019/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014020 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014021 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014022 * -1 and raise an exception on error */
14023static int
Victor Stinnera47082312012-10-04 02:19:54 +020014024mainformatlong(PyObject *v,
14025 struct unicode_format_arg_t *arg,
14026 PyObject **p_output,
14027 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014028{
14029 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014030 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014031
14032 if (!PyNumber_Check(v))
14033 goto wrongtype;
14034
Ethan Furman9ab74802014-03-21 06:38:46 -070014035 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014036 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014037 if (type == 'o' || type == 'x' || type == 'X') {
14038 iobj = PyNumber_Index(v);
14039 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014040 if (PyErr_ExceptionMatches(PyExc_TypeError))
14041 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014042 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014043 }
14044 }
14045 else {
14046 iobj = PyNumber_Long(v);
14047 if (iobj == NULL ) {
14048 if (PyErr_ExceptionMatches(PyExc_TypeError))
14049 goto wrongtype;
14050 return -1;
14051 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014052 }
14053 assert(PyLong_Check(iobj));
14054 }
14055 else {
14056 iobj = v;
14057 Py_INCREF(iobj);
14058 }
14059
14060 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014061 && arg->width == -1 && arg->prec == -1
14062 && !(arg->flags & (F_SIGN | F_BLANK))
14063 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014064 {
14065 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014066 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014067 int base;
14068
Victor Stinnera47082312012-10-04 02:19:54 +020014069 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014070 {
14071 default:
14072 assert(0 && "'type' not in [diuoxX]");
14073 case 'd':
14074 case 'i':
14075 case 'u':
14076 base = 10;
14077 break;
14078 case 'o':
14079 base = 8;
14080 break;
14081 case 'x':
14082 case 'X':
14083 base = 16;
14084 break;
14085 }
14086
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014087 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14088 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014089 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014090 }
14091 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014092 return 1;
14093 }
14094
Victor Stinnera47082312012-10-04 02:19:54 +020014095 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014096 Py_DECREF(iobj);
14097 if (res == NULL)
14098 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014099 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014100 return 0;
14101
14102wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014103 switch(type)
14104 {
14105 case 'o':
14106 case 'x':
14107 case 'X':
14108 PyErr_Format(PyExc_TypeError,
14109 "%%%c format: an integer is required, "
14110 "not %.200s",
14111 type, Py_TYPE(v)->tp_name);
14112 break;
14113 default:
14114 PyErr_Format(PyExc_TypeError,
14115 "%%%c format: a number is required, "
14116 "not %.200s",
14117 type, Py_TYPE(v)->tp_name);
14118 break;
14119 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014120 return -1;
14121}
14122
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014123static Py_UCS4
14124formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014125{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014126 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014127 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014128 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014129 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014130 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014131 goto onError;
14132 }
14133 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014134 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014135 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014136 /* make sure number is a type of integer */
14137 if (!PyLong_Check(v)) {
14138 iobj = PyNumber_Index(v);
14139 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014140 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014141 }
14142 v = iobj;
14143 Py_DECREF(iobj);
14144 }
14145 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014146 x = PyLong_AsLong(v);
14147 if (x == -1 && PyErr_Occurred())
14148 goto onError;
14149
Victor Stinner8faf8212011-12-08 22:14:11 +010014150 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014151 PyErr_SetString(PyExc_OverflowError,
14152 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014153 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014154 }
14155
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014156 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014157 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014158
Benjamin Peterson29060642009-01-31 22:14:21 +000014159 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014160 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014161 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014162 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014163}
14164
Victor Stinnera47082312012-10-04 02:19:54 +020014165/* Parse options of an argument: flags, width, precision.
14166 Handle also "%(name)" syntax.
14167
14168 Return 0 if the argument has been formatted into arg->str.
14169 Return 1 if the argument has been written into ctx->writer,
14170 Raise an exception and return -1 on error. */
14171static int
14172unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14173 struct unicode_format_arg_t *arg)
14174{
14175#define FORMAT_READ(ctx) \
14176 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14177
14178 PyObject *v;
14179
Victor Stinnera47082312012-10-04 02:19:54 +020014180 if (arg->ch == '(') {
14181 /* Get argument value from a dictionary. Example: "%(name)s". */
14182 Py_ssize_t keystart;
14183 Py_ssize_t keylen;
14184 PyObject *key;
14185 int pcount = 1;
14186
14187 if (ctx->dict == NULL) {
14188 PyErr_SetString(PyExc_TypeError,
14189 "format requires a mapping");
14190 return -1;
14191 }
14192 ++ctx->fmtpos;
14193 --ctx->fmtcnt;
14194 keystart = ctx->fmtpos;
14195 /* Skip over balanced parentheses */
14196 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14197 arg->ch = FORMAT_READ(ctx);
14198 if (arg->ch == ')')
14199 --pcount;
14200 else if (arg->ch == '(')
14201 ++pcount;
14202 ctx->fmtpos++;
14203 }
14204 keylen = ctx->fmtpos - keystart - 1;
14205 if (ctx->fmtcnt < 0 || pcount > 0) {
14206 PyErr_SetString(PyExc_ValueError,
14207 "incomplete format key");
14208 return -1;
14209 }
14210 key = PyUnicode_Substring(ctx->fmtstr,
14211 keystart, keystart + keylen);
14212 if (key == NULL)
14213 return -1;
14214 if (ctx->args_owned) {
14215 Py_DECREF(ctx->args);
14216 ctx->args_owned = 0;
14217 }
14218 ctx->args = PyObject_GetItem(ctx->dict, key);
14219 Py_DECREF(key);
14220 if (ctx->args == NULL)
14221 return -1;
14222 ctx->args_owned = 1;
14223 ctx->arglen = -1;
14224 ctx->argidx = -2;
14225 }
14226
14227 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014228 while (--ctx->fmtcnt >= 0) {
14229 arg->ch = FORMAT_READ(ctx);
14230 ctx->fmtpos++;
14231 switch (arg->ch) {
14232 case '-': arg->flags |= F_LJUST; continue;
14233 case '+': arg->flags |= F_SIGN; continue;
14234 case ' ': arg->flags |= F_BLANK; continue;
14235 case '#': arg->flags |= F_ALT; continue;
14236 case '0': arg->flags |= F_ZERO; continue;
14237 }
14238 break;
14239 }
14240
14241 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014242 if (arg->ch == '*') {
14243 v = unicode_format_getnextarg(ctx);
14244 if (v == NULL)
14245 return -1;
14246 if (!PyLong_Check(v)) {
14247 PyErr_SetString(PyExc_TypeError,
14248 "* wants int");
14249 return -1;
14250 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014251 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014252 if (arg->width == -1 && PyErr_Occurred())
14253 return -1;
14254 if (arg->width < 0) {
14255 arg->flags |= F_LJUST;
14256 arg->width = -arg->width;
14257 }
14258 if (--ctx->fmtcnt >= 0) {
14259 arg->ch = FORMAT_READ(ctx);
14260 ctx->fmtpos++;
14261 }
14262 }
14263 else if (arg->ch >= '0' && arg->ch <= '9') {
14264 arg->width = arg->ch - '0';
14265 while (--ctx->fmtcnt >= 0) {
14266 arg->ch = FORMAT_READ(ctx);
14267 ctx->fmtpos++;
14268 if (arg->ch < '0' || arg->ch > '9')
14269 break;
14270 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14271 mixing signed and unsigned comparison. Since arg->ch is between
14272 '0' and '9', casting to int is safe. */
14273 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14274 PyErr_SetString(PyExc_ValueError,
14275 "width too big");
14276 return -1;
14277 }
14278 arg->width = arg->width*10 + (arg->ch - '0');
14279 }
14280 }
14281
14282 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014283 if (arg->ch == '.') {
14284 arg->prec = 0;
14285 if (--ctx->fmtcnt >= 0) {
14286 arg->ch = FORMAT_READ(ctx);
14287 ctx->fmtpos++;
14288 }
14289 if (arg->ch == '*') {
14290 v = unicode_format_getnextarg(ctx);
14291 if (v == NULL)
14292 return -1;
14293 if (!PyLong_Check(v)) {
14294 PyErr_SetString(PyExc_TypeError,
14295 "* wants int");
14296 return -1;
14297 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014298 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014299 if (arg->prec == -1 && PyErr_Occurred())
14300 return -1;
14301 if (arg->prec < 0)
14302 arg->prec = 0;
14303 if (--ctx->fmtcnt >= 0) {
14304 arg->ch = FORMAT_READ(ctx);
14305 ctx->fmtpos++;
14306 }
14307 }
14308 else if (arg->ch >= '0' && arg->ch <= '9') {
14309 arg->prec = arg->ch - '0';
14310 while (--ctx->fmtcnt >= 0) {
14311 arg->ch = FORMAT_READ(ctx);
14312 ctx->fmtpos++;
14313 if (arg->ch < '0' || arg->ch > '9')
14314 break;
14315 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14316 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014317 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014318 return -1;
14319 }
14320 arg->prec = arg->prec*10 + (arg->ch - '0');
14321 }
14322 }
14323 }
14324
14325 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14326 if (ctx->fmtcnt >= 0) {
14327 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14328 if (--ctx->fmtcnt >= 0) {
14329 arg->ch = FORMAT_READ(ctx);
14330 ctx->fmtpos++;
14331 }
14332 }
14333 }
14334 if (ctx->fmtcnt < 0) {
14335 PyErr_SetString(PyExc_ValueError,
14336 "incomplete format");
14337 return -1;
14338 }
14339 return 0;
14340
14341#undef FORMAT_READ
14342}
14343
14344/* Format one argument. Supported conversion specifiers:
14345
14346 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014347 - "i", "d", "u": int or float
14348 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014349 - "e", "E", "f", "F", "g", "G": float
14350 - "c": int or str (1 character)
14351
Victor Stinner8dbd4212012-12-04 09:30:24 +010014352 When possible, the output is written directly into the Unicode writer
14353 (ctx->writer). A string is created when padding is required.
14354
Victor Stinnera47082312012-10-04 02:19:54 +020014355 Return 0 if the argument has been formatted into *p_str,
14356 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014357 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014358static int
14359unicode_format_arg_format(struct unicode_formatter_t *ctx,
14360 struct unicode_format_arg_t *arg,
14361 PyObject **p_str)
14362{
14363 PyObject *v;
14364 _PyUnicodeWriter *writer = &ctx->writer;
14365
14366 if (ctx->fmtcnt == 0)
14367 ctx->writer.overallocate = 0;
14368
14369 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014370 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014371 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014372 return 1;
14373 }
14374
14375 v = unicode_format_getnextarg(ctx);
14376 if (v == NULL)
14377 return -1;
14378
Victor Stinnera47082312012-10-04 02:19:54 +020014379
14380 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014381 case 's':
14382 case 'r':
14383 case 'a':
14384 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14385 /* Fast path */
14386 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14387 return -1;
14388 return 1;
14389 }
14390
14391 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14392 *p_str = v;
14393 Py_INCREF(*p_str);
14394 }
14395 else {
14396 if (arg->ch == 's')
14397 *p_str = PyObject_Str(v);
14398 else if (arg->ch == 'r')
14399 *p_str = PyObject_Repr(v);
14400 else
14401 *p_str = PyObject_ASCII(v);
14402 }
14403 break;
14404
14405 case 'i':
14406 case 'd':
14407 case 'u':
14408 case 'o':
14409 case 'x':
14410 case 'X':
14411 {
14412 int ret = mainformatlong(v, arg, p_str, writer);
14413 if (ret != 0)
14414 return ret;
14415 arg->sign = 1;
14416 break;
14417 }
14418
14419 case 'e':
14420 case 'E':
14421 case 'f':
14422 case 'F':
14423 case 'g':
14424 case 'G':
14425 if (arg->width == -1 && arg->prec == -1
14426 && !(arg->flags & (F_SIGN | F_BLANK)))
14427 {
14428 /* Fast path */
14429 if (formatfloat(v, arg, NULL, writer) == -1)
14430 return -1;
14431 return 1;
14432 }
14433
14434 arg->sign = 1;
14435 if (formatfloat(v, arg, p_str, NULL) == -1)
14436 return -1;
14437 break;
14438
14439 case 'c':
14440 {
14441 Py_UCS4 ch = formatchar(v);
14442 if (ch == (Py_UCS4) -1)
14443 return -1;
14444 if (arg->width == -1 && arg->prec == -1) {
14445 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014446 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014447 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014448 return 1;
14449 }
14450 *p_str = PyUnicode_FromOrdinal(ch);
14451 break;
14452 }
14453
14454 default:
14455 PyErr_Format(PyExc_ValueError,
14456 "unsupported format character '%c' (0x%x) "
14457 "at index %zd",
14458 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14459 (int)arg->ch,
14460 ctx->fmtpos - 1);
14461 return -1;
14462 }
14463 if (*p_str == NULL)
14464 return -1;
14465 assert (PyUnicode_Check(*p_str));
14466 return 0;
14467}
14468
14469static int
14470unicode_format_arg_output(struct unicode_formatter_t *ctx,
14471 struct unicode_format_arg_t *arg,
14472 PyObject *str)
14473{
14474 Py_ssize_t len;
14475 enum PyUnicode_Kind kind;
14476 void *pbuf;
14477 Py_ssize_t pindex;
14478 Py_UCS4 signchar;
14479 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014480 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014481 Py_ssize_t sublen;
14482 _PyUnicodeWriter *writer = &ctx->writer;
14483 Py_UCS4 fill;
14484
14485 fill = ' ';
14486 if (arg->sign && arg->flags & F_ZERO)
14487 fill = '0';
14488
14489 if (PyUnicode_READY(str) == -1)
14490 return -1;
14491
14492 len = PyUnicode_GET_LENGTH(str);
14493 if ((arg->width == -1 || arg->width <= len)
14494 && (arg->prec == -1 || arg->prec >= len)
14495 && !(arg->flags & (F_SIGN | F_BLANK)))
14496 {
14497 /* Fast path */
14498 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14499 return -1;
14500 return 0;
14501 }
14502
14503 /* Truncate the string for "s", "r" and "a" formats
14504 if the precision is set */
14505 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14506 if (arg->prec >= 0 && len > arg->prec)
14507 len = arg->prec;
14508 }
14509
14510 /* Adjust sign and width */
14511 kind = PyUnicode_KIND(str);
14512 pbuf = PyUnicode_DATA(str);
14513 pindex = 0;
14514 signchar = '\0';
14515 if (arg->sign) {
14516 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14517 if (ch == '-' || ch == '+') {
14518 signchar = ch;
14519 len--;
14520 pindex++;
14521 }
14522 else if (arg->flags & F_SIGN)
14523 signchar = '+';
14524 else if (arg->flags & F_BLANK)
14525 signchar = ' ';
14526 else
14527 arg->sign = 0;
14528 }
14529 if (arg->width < len)
14530 arg->width = len;
14531
14532 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014533 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014534 if (!(arg->flags & F_LJUST)) {
14535 if (arg->sign) {
14536 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014537 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014538 }
14539 else {
14540 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014541 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014542 }
14543 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014544 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14545 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014546 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014547 }
14548
Victor Stinnera47082312012-10-04 02:19:54 +020014549 buflen = arg->width;
14550 if (arg->sign && len == arg->width)
14551 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014552 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014553 return -1;
14554
14555 /* Write the sign if needed */
14556 if (arg->sign) {
14557 if (fill != ' ') {
14558 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14559 writer->pos += 1;
14560 }
14561 if (arg->width > len)
14562 arg->width--;
14563 }
14564
14565 /* Write the numeric prefix for "x", "X" and "o" formats
14566 if the alternate form is used.
14567 For example, write "0x" for the "%#x" format. */
14568 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14569 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14570 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14571 if (fill != ' ') {
14572 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14573 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14574 writer->pos += 2;
14575 pindex += 2;
14576 }
14577 arg->width -= 2;
14578 if (arg->width < 0)
14579 arg->width = 0;
14580 len -= 2;
14581 }
14582
14583 /* Pad left with the fill character if needed */
14584 if (arg->width > len && !(arg->flags & F_LJUST)) {
14585 sublen = arg->width - len;
14586 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14587 writer->pos += sublen;
14588 arg->width = len;
14589 }
14590
14591 /* If padding with spaces: write sign if needed and/or numeric prefix if
14592 the alternate form is used */
14593 if (fill == ' ') {
14594 if (arg->sign) {
14595 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14596 writer->pos += 1;
14597 }
14598 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14599 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14600 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14601 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14602 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14603 writer->pos += 2;
14604 pindex += 2;
14605 }
14606 }
14607
14608 /* Write characters */
14609 if (len) {
14610 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14611 str, pindex, len);
14612 writer->pos += len;
14613 }
14614
14615 /* Pad right with the fill character if needed */
14616 if (arg->width > len) {
14617 sublen = arg->width - len;
14618 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14619 writer->pos += sublen;
14620 }
14621 return 0;
14622}
14623
14624/* Helper of PyUnicode_Format(): format one arg.
14625 Return 0 on success, raise an exception and return -1 on error. */
14626static int
14627unicode_format_arg(struct unicode_formatter_t *ctx)
14628{
14629 struct unicode_format_arg_t arg;
14630 PyObject *str;
14631 int ret;
14632
Victor Stinner8dbd4212012-12-04 09:30:24 +010014633 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14634 arg.flags = 0;
14635 arg.width = -1;
14636 arg.prec = -1;
14637 arg.sign = 0;
14638 str = NULL;
14639
Victor Stinnera47082312012-10-04 02:19:54 +020014640 ret = unicode_format_arg_parse(ctx, &arg);
14641 if (ret == -1)
14642 return -1;
14643
14644 ret = unicode_format_arg_format(ctx, &arg, &str);
14645 if (ret == -1)
14646 return -1;
14647
14648 if (ret != 1) {
14649 ret = unicode_format_arg_output(ctx, &arg, str);
14650 Py_DECREF(str);
14651 if (ret == -1)
14652 return -1;
14653 }
14654
14655 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14656 PyErr_SetString(PyExc_TypeError,
14657 "not all arguments converted during string formatting");
14658 return -1;
14659 }
14660 return 0;
14661}
14662
Alexander Belopolsky40018472011-02-26 01:02:56 +000014663PyObject *
14664PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014665{
Victor Stinnera47082312012-10-04 02:19:54 +020014666 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014667
Guido van Rossumd57fd912000-03-10 22:53:23 +000014668 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014669 PyErr_BadInternalCall();
14670 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014671 }
Victor Stinnera47082312012-10-04 02:19:54 +020014672
14673 ctx.fmtstr = PyUnicode_FromObject(format);
14674 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014675 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014676 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14677 Py_DECREF(ctx.fmtstr);
14678 return NULL;
14679 }
14680 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14681 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14682 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14683 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014684
Victor Stinner8f674cc2013-04-17 23:02:17 +020014685 _PyUnicodeWriter_Init(&ctx.writer);
14686 ctx.writer.min_length = ctx.fmtcnt + 100;
14687 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014688
Guido van Rossumd57fd912000-03-10 22:53:23 +000014689 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014690 ctx.arglen = PyTuple_Size(args);
14691 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014692 }
14693 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014694 ctx.arglen = -1;
14695 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014696 }
Victor Stinnera47082312012-10-04 02:19:54 +020014697 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014698 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014699 ctx.dict = args;
14700 else
14701 ctx.dict = NULL;
14702 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014703
Victor Stinnera47082312012-10-04 02:19:54 +020014704 while (--ctx.fmtcnt >= 0) {
14705 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014706 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014707
14708 nonfmtpos = ctx.fmtpos++;
14709 while (ctx.fmtcnt >= 0 &&
14710 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14711 ctx.fmtpos++;
14712 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014713 }
Victor Stinnera47082312012-10-04 02:19:54 +020014714 if (ctx.fmtcnt < 0) {
14715 ctx.fmtpos--;
14716 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014717 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014718
Victor Stinnercfc4c132013-04-03 01:48:39 +020014719 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14720 nonfmtpos, ctx.fmtpos) < 0)
14721 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014722 }
14723 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014724 ctx.fmtpos++;
14725 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014726 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014727 }
14728 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014729
Victor Stinnera47082312012-10-04 02:19:54 +020014730 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014731 PyErr_SetString(PyExc_TypeError,
14732 "not all arguments converted during string formatting");
14733 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014734 }
14735
Victor Stinnera47082312012-10-04 02:19:54 +020014736 if (ctx.args_owned) {
14737 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014738 }
Victor Stinnera47082312012-10-04 02:19:54 +020014739 Py_DECREF(ctx.fmtstr);
14740 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014741
Benjamin Peterson29060642009-01-31 22:14:21 +000014742 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014743 Py_DECREF(ctx.fmtstr);
14744 _PyUnicodeWriter_Dealloc(&ctx.writer);
14745 if (ctx.args_owned) {
14746 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014747 }
14748 return NULL;
14749}
14750
Jeremy Hylton938ace62002-07-17 16:30:39 +000014751static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014752unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14753
Tim Peters6d6c1a32001-08-02 04:15:00 +000014754static PyObject *
14755unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14756{
Benjamin Peterson29060642009-01-31 22:14:21 +000014757 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014758 static char *kwlist[] = {"object", "encoding", "errors", 0};
14759 char *encoding = NULL;
14760 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014761
Benjamin Peterson14339b62009-01-31 16:36:08 +000014762 if (type != &PyUnicode_Type)
14763 return unicode_subtype_new(type, args, kwds);
14764 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014765 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014766 return NULL;
14767 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014768 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014769 if (encoding == NULL && errors == NULL)
14770 return PyObject_Str(x);
14771 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014772 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014773}
14774
Guido van Rossume023fe02001-08-30 03:12:59 +000014775static PyObject *
14776unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14777{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014778 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014779 Py_ssize_t length, char_size;
14780 int share_wstr, share_utf8;
14781 unsigned int kind;
14782 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014783
Benjamin Peterson14339b62009-01-31 16:36:08 +000014784 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014785
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014786 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014787 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014788 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014789 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014790 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014791 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014792 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014793 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014794
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014795 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014796 if (self == NULL) {
14797 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014798 return NULL;
14799 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014800 kind = PyUnicode_KIND(unicode);
14801 length = PyUnicode_GET_LENGTH(unicode);
14802
14803 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014804#ifdef Py_DEBUG
14805 _PyUnicode_HASH(self) = -1;
14806#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014807 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014808#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014809 _PyUnicode_STATE(self).interned = 0;
14810 _PyUnicode_STATE(self).kind = kind;
14811 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014812 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014813 _PyUnicode_STATE(self).ready = 1;
14814 _PyUnicode_WSTR(self) = NULL;
14815 _PyUnicode_UTF8_LENGTH(self) = 0;
14816 _PyUnicode_UTF8(self) = NULL;
14817 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014818 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014819
14820 share_utf8 = 0;
14821 share_wstr = 0;
14822 if (kind == PyUnicode_1BYTE_KIND) {
14823 char_size = 1;
14824 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14825 share_utf8 = 1;
14826 }
14827 else if (kind == PyUnicode_2BYTE_KIND) {
14828 char_size = 2;
14829 if (sizeof(wchar_t) == 2)
14830 share_wstr = 1;
14831 }
14832 else {
14833 assert(kind == PyUnicode_4BYTE_KIND);
14834 char_size = 4;
14835 if (sizeof(wchar_t) == 4)
14836 share_wstr = 1;
14837 }
14838
14839 /* Ensure we won't overflow the length. */
14840 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14841 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014842 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014843 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014844 data = PyObject_MALLOC((length + 1) * char_size);
14845 if (data == NULL) {
14846 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014847 goto onError;
14848 }
14849
Victor Stinnerc3c74152011-10-02 20:39:55 +020014850 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014851 if (share_utf8) {
14852 _PyUnicode_UTF8_LENGTH(self) = length;
14853 _PyUnicode_UTF8(self) = data;
14854 }
14855 if (share_wstr) {
14856 _PyUnicode_WSTR_LENGTH(self) = length;
14857 _PyUnicode_WSTR(self) = (wchar_t *)data;
14858 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014859
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014860 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014861 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014862 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014863#ifdef Py_DEBUG
14864 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14865#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014866 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014867 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014868
14869onError:
14870 Py_DECREF(unicode);
14871 Py_DECREF(self);
14872 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014873}
14874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014875PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014876"str(object='') -> str\n\
14877str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014878\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014879Create a new string object from the given object. If encoding or\n\
14880errors is specified, then the object must expose a data buffer\n\
14881that will be decoded using the given encoding and error handler.\n\
14882Otherwise, returns the result of object.__str__() (if defined)\n\
14883or repr(object).\n\
14884encoding defaults to sys.getdefaultencoding().\n\
14885errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014886
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014887static PyObject *unicode_iter(PyObject *seq);
14888
Guido van Rossumd57fd912000-03-10 22:53:23 +000014889PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014890 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014891 "str", /* tp_name */
14892 sizeof(PyUnicodeObject), /* tp_size */
14893 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014894 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014895 (destructor)unicode_dealloc, /* tp_dealloc */
14896 0, /* tp_print */
14897 0, /* tp_getattr */
14898 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014899 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014900 unicode_repr, /* tp_repr */
14901 &unicode_as_number, /* tp_as_number */
14902 &unicode_as_sequence, /* tp_as_sequence */
14903 &unicode_as_mapping, /* tp_as_mapping */
14904 (hashfunc) unicode_hash, /* tp_hash*/
14905 0, /* tp_call*/
14906 (reprfunc) unicode_str, /* tp_str */
14907 PyObject_GenericGetAttr, /* tp_getattro */
14908 0, /* tp_setattro */
14909 0, /* tp_as_buffer */
14910 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014911 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014912 unicode_doc, /* tp_doc */
14913 0, /* tp_traverse */
14914 0, /* tp_clear */
14915 PyUnicode_RichCompare, /* tp_richcompare */
14916 0, /* tp_weaklistoffset */
14917 unicode_iter, /* tp_iter */
14918 0, /* tp_iternext */
14919 unicode_methods, /* tp_methods */
14920 0, /* tp_members */
14921 0, /* tp_getset */
14922 &PyBaseObject_Type, /* tp_base */
14923 0, /* tp_dict */
14924 0, /* tp_descr_get */
14925 0, /* tp_descr_set */
14926 0, /* tp_dictoffset */
14927 0, /* tp_init */
14928 0, /* tp_alloc */
14929 unicode_new, /* tp_new */
14930 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014931};
14932
14933/* Initialize the Unicode implementation */
14934
Victor Stinner3a50e702011-10-18 21:21:00 +020014935int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014936{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014937 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014938 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014939 0x000A, /* LINE FEED */
14940 0x000D, /* CARRIAGE RETURN */
14941 0x001C, /* FILE SEPARATOR */
14942 0x001D, /* GROUP SEPARATOR */
14943 0x001E, /* RECORD SEPARATOR */
14944 0x0085, /* NEXT LINE */
14945 0x2028, /* LINE SEPARATOR */
14946 0x2029, /* PARAGRAPH SEPARATOR */
14947 };
14948
Fred Drakee4315f52000-05-09 19:53:39 +000014949 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014950 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014951 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014952 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014953 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014954
Guido van Rossumcacfc072002-05-24 19:01:59 +000014955 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014956 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014957
14958 /* initialize the linebreak bloom filter */
14959 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014960 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014961 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014962
Christian Heimes26532f72013-07-20 14:57:16 +020014963 if (PyType_Ready(&EncodingMapType) < 0)
14964 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014965
Benjamin Petersonc4311282012-10-30 23:21:10 -040014966 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14967 Py_FatalError("Can't initialize field name iterator type");
14968
14969 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14970 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014971
Victor Stinner3a50e702011-10-18 21:21:00 +020014972#ifdef HAVE_MBCS
14973 winver.dwOSVersionInfoSize = sizeof(winver);
14974 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14975 PyErr_SetFromWindowsErr(0);
14976 return -1;
14977 }
14978#endif
14979 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014980}
14981
14982/* Finalize the Unicode implementation */
14983
/* Nothing is released by this function in this implementation;
   it always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
14989
Guido van Rossumd57fd912000-03-10 22:53:23 +000014990void
Thomas Wouters78890102000-07-22 19:25:51 +000014991_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014992{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014993 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014994
Serhiy Storchaka05997252013-01-26 12:14:02 +020014995 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014996
Serhiy Storchaka05997252013-01-26 12:14:02 +020014997 for (i = 0; i < 256; i++)
14998 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014999 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015000 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015001}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015002
Walter Dörwald16807132007-05-25 13:52:07 +000015003void
15004PyUnicode_InternInPlace(PyObject **p)
15005{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015006 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015007 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015008#ifdef Py_DEBUG
15009 assert(s != NULL);
15010 assert(_PyUnicode_CHECK(s));
15011#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015012 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015013 return;
15014#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015015 /* If it's a subclass, we don't really know what putting
15016 it in the interned dict might do. */
15017 if (!PyUnicode_CheckExact(s))
15018 return;
15019 if (PyUnicode_CHECK_INTERNED(s))
15020 return;
15021 if (interned == NULL) {
15022 interned = PyDict_New();
15023 if (interned == NULL) {
15024 PyErr_Clear(); /* Don't leave an exception */
15025 return;
15026 }
15027 }
15028 /* It might be that the GetItem call fails even
15029 though the key is present in the dictionary,
15030 namely when this happens during a stack overflow. */
15031 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010015032 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015033 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000015034
Victor Stinnerf0335102013-04-14 19:13:03 +020015035 if (t) {
15036 Py_INCREF(t);
15037 Py_DECREF(*p);
15038 *p = t;
15039 return;
15040 }
Walter Dörwald16807132007-05-25 13:52:07 +000015041
Benjamin Peterson14339b62009-01-31 16:36:08 +000015042 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015043 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015044 PyErr_Clear();
15045 PyThreadState_GET()->recursion_critical = 0;
15046 return;
15047 }
15048 PyThreadState_GET()->recursion_critical = 0;
15049 /* The two references in interned are not counted by refcnt.
15050 The deallocator will take care of this */
15051 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015052 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015053}
15054
15055void
15056PyUnicode_InternImmortal(PyObject **p)
15057{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015058 PyUnicode_InternInPlace(p);
15059 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015060 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015061 Py_INCREF(*p);
15062 }
Walter Dörwald16807132007-05-25 13:52:07 +000015063}
15064
15065PyObject *
15066PyUnicode_InternFromString(const char *cp)
15067{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015068 PyObject *s = PyUnicode_FromString(cp);
15069 if (s == NULL)
15070 return NULL;
15071 PyUnicode_InternInPlace(&s);
15072 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015073}
15074
Alexander Belopolsky40018472011-02-26 01:02:56 +000015075void
15076_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015077{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015078 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015079 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015080 Py_ssize_t i, n;
15081 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015082
Benjamin Peterson14339b62009-01-31 16:36:08 +000015083 if (interned == NULL || !PyDict_Check(interned))
15084 return;
15085 keys = PyDict_Keys(interned);
15086 if (keys == NULL || !PyList_Check(keys)) {
15087 PyErr_Clear();
15088 return;
15089 }
Walter Dörwald16807132007-05-25 13:52:07 +000015090
Benjamin Peterson14339b62009-01-31 16:36:08 +000015091 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15092 detector, interned unicode strings are not forcibly deallocated;
15093 rather, we give them their stolen references back, and then clear
15094 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015095
Benjamin Peterson14339b62009-01-31 16:36:08 +000015096 n = PyList_GET_SIZE(keys);
15097 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015098 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015099 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015100 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015101 if (PyUnicode_READY(s) == -1) {
15102 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015103 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015104 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015105 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015106 case SSTATE_NOT_INTERNED:
15107 /* XXX Shouldn't happen */
15108 break;
15109 case SSTATE_INTERNED_IMMORTAL:
15110 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015111 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015112 break;
15113 case SSTATE_INTERNED_MORTAL:
15114 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015115 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015116 break;
15117 default:
15118 Py_FatalError("Inconsistent interned string state.");
15119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015120 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015121 }
15122 fprintf(stderr, "total size of all interned strings: "
15123 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15124 "mortal/immortal\n", mortal_size, immortal_size);
15125 Py_DECREF(keys);
15126 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015127 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015128}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015129
15130
15131/********************* Unicode Iterator **************************/
15132
15133typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015134 PyObject_HEAD
15135 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015136 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015137} unicodeiterobject;
15138
15139static void
15140unicodeiter_dealloc(unicodeiterobject *it)
15141{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015142 _PyObject_GC_UNTRACK(it);
15143 Py_XDECREF(it->it_seq);
15144 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015145}
15146
15147static int
15148unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15149{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015150 Py_VISIT(it->it_seq);
15151 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015152}
15153
15154static PyObject *
15155unicodeiter_next(unicodeiterobject *it)
15156{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015157 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015158
Benjamin Peterson14339b62009-01-31 16:36:08 +000015159 assert(it != NULL);
15160 seq = it->it_seq;
15161 if (seq == NULL)
15162 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015163 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015165 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15166 int kind = PyUnicode_KIND(seq);
15167 void *data = PyUnicode_DATA(seq);
15168 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15169 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015170 if (item != NULL)
15171 ++it->it_index;
15172 return item;
15173 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015174
Benjamin Peterson14339b62009-01-31 16:36:08 +000015175 Py_DECREF(seq);
15176 it->it_seq = NULL;
15177 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015178}
15179
15180static PyObject *
15181unicodeiter_len(unicodeiterobject *it)
15182{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015183 Py_ssize_t len = 0;
15184 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015185 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015186 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015187}
15188
15189PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15190
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015191static PyObject *
15192unicodeiter_reduce(unicodeiterobject *it)
15193{
15194 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015195 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015196 it->it_seq, it->it_index);
15197 } else {
15198 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15199 if (u == NULL)
15200 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015201 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015202 }
15203}
15204
15205PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15206
15207static PyObject *
15208unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15209{
15210 Py_ssize_t index = PyLong_AsSsize_t(state);
15211 if (index == -1 && PyErr_Occurred())
15212 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015213 if (it->it_seq != NULL) {
15214 if (index < 0)
15215 index = 0;
15216 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15217 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15218 it->it_index = index;
15219 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015220 Py_RETURN_NONE;
15221}
15222
15223PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15224
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015225static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015226 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015227 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015228 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15229 reduce_doc},
15230 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15231 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015232 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015233};
15234
15235PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015236 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15237 "str_iterator", /* tp_name */
15238 sizeof(unicodeiterobject), /* tp_basicsize */
15239 0, /* tp_itemsize */
15240 /* methods */
15241 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15242 0, /* tp_print */
15243 0, /* tp_getattr */
15244 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015245 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015246 0, /* tp_repr */
15247 0, /* tp_as_number */
15248 0, /* tp_as_sequence */
15249 0, /* tp_as_mapping */
15250 0, /* tp_hash */
15251 0, /* tp_call */
15252 0, /* tp_str */
15253 PyObject_GenericGetAttr, /* tp_getattro */
15254 0, /* tp_setattro */
15255 0, /* tp_as_buffer */
15256 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15257 0, /* tp_doc */
15258 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15259 0, /* tp_clear */
15260 0, /* tp_richcompare */
15261 0, /* tp_weaklistoffset */
15262 PyObject_SelfIter, /* tp_iter */
15263 (iternextfunc)unicodeiter_next, /* tp_iternext */
15264 unicodeiter_methods, /* tp_methods */
15265 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015266};
15267
15268static PyObject *
15269unicode_iter(PyObject *seq)
15270{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015271 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015272
Benjamin Peterson14339b62009-01-31 16:36:08 +000015273 if (!PyUnicode_Check(seq)) {
15274 PyErr_BadInternalCall();
15275 return NULL;
15276 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015277 if (PyUnicode_READY(seq) == -1)
15278 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015279 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15280 if (it == NULL)
15281 return NULL;
15282 it->it_index = 0;
15283 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015284 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015285 _PyObject_GC_TRACK(it);
15286 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015287}
15288
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015289
15290size_t
15291Py_UNICODE_strlen(const Py_UNICODE *u)
15292{
15293 int res = 0;
15294 while(*u++)
15295 res++;
15296 return res;
15297}
15298
15299Py_UNICODE*
15300Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15301{
15302 Py_UNICODE *u = s1;
15303 while ((*u++ = *s2++));
15304 return s1;
15305}
15306
15307Py_UNICODE*
15308Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15309{
15310 Py_UNICODE *u = s1;
15311 while ((*u++ = *s2++))
15312 if (n-- == 0)
15313 break;
15314 return s1;
15315}
15316
15317Py_UNICODE*
15318Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15319{
15320 Py_UNICODE *u1 = s1;
15321 u1 += Py_UNICODE_strlen(u1);
15322 Py_UNICODE_strcpy(u1, s2);
15323 return s1;
15324}
15325
15326int
15327Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15328{
15329 while (*s1 && *s2 && *s1 == *s2)
15330 s1++, s2++;
15331 if (*s1 && *s2)
15332 return (*s1 < *s2) ? -1 : +1;
15333 if (*s1)
15334 return 1;
15335 if (*s2)
15336 return -1;
15337 return 0;
15338}
15339
15340int
15341Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15342{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015343 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015344 for (; n != 0; n--) {
15345 u1 = *s1;
15346 u2 = *s2;
15347 if (u1 != u2)
15348 return (u1 < u2) ? -1 : +1;
15349 if (u1 == '\0')
15350 return 0;
15351 s1++;
15352 s2++;
15353 }
15354 return 0;
15355}
15356
15357Py_UNICODE*
15358Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15359{
15360 const Py_UNICODE *p;
15361 for (p = s; *p; p++)
15362 if (*p == c)
15363 return (Py_UNICODE*)p;
15364 return NULL;
15365}
15366
15367Py_UNICODE*
15368Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15369{
15370 const Py_UNICODE *p;
15371 p = s + Py_UNICODE_strlen(s);
15372 while (p != s) {
15373 p--;
15374 if (*p == c)
15375 return (Py_UNICODE*)p;
15376 }
15377 return NULL;
15378}
Victor Stinner331ea922010-08-10 16:37:20 +000015379
Victor Stinner71133ff2010-09-01 23:43:53 +000015380Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015381PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015382{
Victor Stinner577db2c2011-10-11 22:12:48 +020015383 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015384 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015386 if (!PyUnicode_Check(unicode)) {
15387 PyErr_BadArgument();
15388 return NULL;
15389 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015390 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015391 if (u == NULL)
15392 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015393 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015394 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015395 PyErr_NoMemory();
15396 return NULL;
15397 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015398 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015399 size *= sizeof(Py_UNICODE);
15400 copy = PyMem_Malloc(size);
15401 if (copy == NULL) {
15402 PyErr_NoMemory();
15403 return NULL;
15404 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015405 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015406 return copy;
15407}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015408
Georg Brandl66c221e2010-10-14 07:04:07 +000015409/* A _string module, to export formatter_parser and formatter_field_name_split
15410 to the string.Formatter class implemented in Python. */
15411
15412static PyMethodDef _string_methods[] = {
15413 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15414 METH_O, PyDoc_STR("split the argument as a field name")},
15415 {"formatter_parser", (PyCFunction) formatter_parser,
15416 METH_O, PyDoc_STR("parse the argument as a format string")},
15417 {NULL, NULL}
15418};
15419
15420static struct PyModuleDef _string_module = {
15421 PyModuleDef_HEAD_INIT,
15422 "_string",
15423 PyDoc_STR("string helper module"),
15424 0,
15425 _string_methods,
15426 NULL,
15427 NULL,
15428 NULL,
15429 NULL
15430};
15431
15432PyMODINIT_FUNC
15433PyInit__string(void)
15434{
15435 return PyModule_Create(&_string_module);
15436}
15437
15438
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015439#ifdef __cplusplus
15440}
15441#endif