blob: 34d51e404cc4012428ea232a2a83e442ac5ede00 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Larry Hastings44e2eaa2013-11-23 15:37:55 -080050/*[clinic]
51class str
52[clinic]*/
53/*[clinic checksum: da39a3ee5e6b4b0d3255bfef95601890afd80709]*/
54
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055/* --- Globals ------------------------------------------------------------
56
Serhiy Storchaka05997252013-01-26 12:14:02 +020057NOTE: In the interpreter's initialization phase, some globals are currently
58 initialized dynamically as needed. In the process Unicode objects may
59 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000060
61*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063
64#ifdef __cplusplus
65extern "C" {
66#endif
67
Victor Stinner8faf8212011-12-08 22:14:11 +010068/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
69#define MAX_UNICODE 0x10ffff
70
/* Sanity check used by the assertions in the macros of this file: the
   structural consistency check in debug builds, a plain type check
   otherwise. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Unchecked field accessors.  Each expands to a plain member expression,
   so it can be read or assigned. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* Checked accessors: assert that OP passes _PyUnicode_CHECK before
   reading the field. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* UTF-8 pointer of a ready string.  For a compact ASCII string this is the
   memory located directly after the PyASCIIObject header; otherwise it is
   the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* UTF-8 length of a ready string.  For a compact ASCII string this is the
   length field; otherwise it is the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Replace any earlier definition of PyUnicode_READY with one that asserts
   via _PyUnicode_CHECK.  Evaluates to 0 if OP is already ready, otherwise
   to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))

/* True if the utf8 pointer of OP is the same pointer as its character
   data.  Must not be used on compact ASCII strings. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of OP is the same pointer as its character
   data.  The macro refers only to its own parameter, so it does not
   depend on the caller having a variable named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* True if the Unicode object has a UTF-8 memory block of its own: it is
   not a compact ASCII string, its utf8 pointer is set, and that pointer
   is not the character data pointer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op) != NULL                     \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object has a wstr memory block of its own: its wstr
   pointer is set and either the string is not ready or the pointer is not
   the character data pointer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) != NULL                        \
     && (!PyUnicode_IS_READY(op)                        \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
140
/* Generic character-width conversion.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and are treated as "from_type *"; to is treated as
   "to_type *" and receives one converted character per source character.
   Each macro argument is evaluated exactly once, in the order to, begin,
   end.  The bulk of the input is copied four characters per iteration;
   the remaining 0-3 characters are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_dst = (to_type *)(to);                                \
        const from_type *_src = (const from_type *)(begin);             \
        const from_type *_stop = (const from_type *)(end);              \
        Py_ssize_t _count = _stop - _src;                               \
        const from_type *_block_stop =                                  \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);                      \
        for (; _src < _block_stop; _src += 4, _dst += 4) {              \
            _dst[0] = (to_type) _src[0];                                \
            _dst[1] = (to_type) _src[1];                                \
            _dst[2] = (to_type) _src[2];                                \
            _dst[3] = (to_type) _src[3];                                \
        }                                                               \
        for (; _src < _stop; ++_src, ++_dst)                            \
            *_dst = (to_type) *_src;                                    \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
Walter Dörwald16807132007-05-25 13:52:07 +0000165/* This dictionary holds all interned unicode strings. Note that references
166 to strings in this dictionary are *not* counted in the string's ob_refcnt.
167 When the interned string reaches a refcnt of 0 the string deallocation
168 function will delete the reference from this dictionary.
169
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000172*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200173static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000174
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200176static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200177
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  If that creation fails,
   unicode_empty is left NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL)                                  \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
        }                                                               \
        if (unicode_empty != NULL)                                      \
            Py_INCREF(unicode_empty);                                   \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function.  The returned value is NULL if the empty string could not be
   created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200197/* Forward declaration */
198Py_LOCAL_INLINE(int)
199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
200
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters: entry C of
   this 128-entry table is 1 if the ASCII code point C is whitespace and
   0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
/* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
              0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
/* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
              0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
/* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x30-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100242static int unicode_modifiable(PyObject *unicode);
243
Victor Stinnerfe226c02011-10-03 03:52:20 +0200244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200247static PyObject *
248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
249static PyObject *
250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
251
252static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100255 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
257
Alexander Belopolsky40018472011-02-26 01:02:56 +0000258static void
259raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300260 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100261 PyObject *unicode,
262 Py_ssize_t startpos, Py_ssize_t endpos,
263 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000264
/* Fast detection of line breaks in the ASCII range: entry C of this
   128-entry table is 1 if the ASCII code point C is a line break and
   0 otherwise.  The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
/* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
              0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
/* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
              0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
/* 0x20-0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
292
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300293/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
294 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000296PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000298#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000299 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000300#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 /* This is actually an illegal character, so it should
302 not be passed to unichr. */
303 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000304#endif
305}
306
#ifdef Py_DEBUG
/* Debug-build validator for the internal layout of a string object.

   Using assert(), verify that the state flags, kind, length and the
   data / utf8 / wstr pointers of OP agree with each other.  If
   CHECK_CONTENT is non-zero and the string does not have
   PyUnicode_WCHAR_KIND, also scan the characters to verify that the
   narrowest suitable kind is used and that the character following the
   last one is 0.

   Always returns 1, so the call can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* non-compact string that is not ready: only wstr is set */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(ascii->length == 0);
            assert(ascii->hash == -1);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
        }
        else {
            /* non-compact, ready string */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert (compact->utf8 == data);
                assert (compact->utf8_length == ascii->length);
            }
            else
                assert (compact->utf8 != data);
        }

        /* wstr must alias the data exactly when the kind matches wchar_t */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else
                assert(ascii->wstr != data);
        }

        /* an absent buffer must have a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t i;

        for (i = 0; i < ascii->length; i++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (maxchar < ch)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii)
                assert(maxchar < 128);
            else {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
422
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100423static PyObject*
424unicode_result_wchar(PyObject *unicode)
425{
426#ifndef Py_DEBUG
427 Py_ssize_t len;
428
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100429 len = _PyUnicode_WSTR_LENGTH(unicode);
430 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200432 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 }
434
435 if (len == 1) {
436 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100437 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100438 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
439 Py_DECREF(unicode);
440 return latin1_char;
441 }
442 }
443
444 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200445 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100446 return NULL;
447 }
448#else
Victor Stinneraa771272012-10-04 02:32:58 +0200449 assert(Py_REFCNT(unicode) == 1);
450
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100451 /* don't make the result ready in debug mode to ensure that the caller
452 makes the string ready before using it */
453 assert(_PyUnicode_CheckConsistency(unicode, 1));
454#endif
455 return unicode;
456}
457
458static PyObject*
459unicode_result_ready(PyObject *unicode)
460{
461 Py_ssize_t length;
462
463 length = PyUnicode_GET_LENGTH(unicode);
464 if (length == 0) {
465 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100466 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200467 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 }
469 return unicode_empty;
470 }
471
472 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200473 void *data = PyUnicode_DATA(unicode);
474 int kind = PyUnicode_KIND(unicode);
475 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100476 if (ch < 256) {
477 PyObject *latin1_char = unicode_latin1[ch];
478 if (latin1_char != NULL) {
479 if (unicode != latin1_char) {
480 Py_INCREF(latin1_char);
481 Py_DECREF(unicode);
482 }
483 return latin1_char;
484 }
485 else {
486 assert(_PyUnicode_CheckConsistency(unicode, 1));
487 Py_INCREF(unicode);
488 unicode_latin1[ch] = unicode;
489 return unicode;
490 }
491 }
492 }
493
494 assert(_PyUnicode_CheckConsistency(unicode, 1));
495 return unicode;
496}
497
498static PyObject*
499unicode_result(PyObject *unicode)
500{
501 assert(_PyUnicode_CHECK(unicode));
502 if (PyUnicode_IS_READY(unicode))
503 return unicode_result_ready(unicode);
504 else
505 return unicode_result_wchar(unicode);
506}
507
Victor Stinnerc4b49542011-12-11 22:44:26 +0100508static PyObject*
509unicode_result_unchanged(PyObject *unicode)
510{
511 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500512 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100513 return NULL;
514 Py_INCREF(unicode);
515 return unicode;
516 }
517 else
518 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100519 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100520}
521
Victor Stinner3a50e702011-10-18 21:21:00 +0200522#ifdef HAVE_MBCS
523static OSVERSIONINFOEX winver;
524#endif
525
Thomas Wouters477c8d52006-05-27 19:21:47 +0000526/* --- Bloom Filters ----------------------------------------------------- */
527
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
531
532/* the linebreak mask is set up by Unicode_Init below */
533
Antoine Pitrouf068f942010-01-13 14:19:12 +0000534#if LONG_BIT >= 128
535#define BLOOM_WIDTH 128
536#elif LONG_BIT >= 64
537#define BLOOM_WIDTH 64
538#elif LONG_BIT >= 32
539#define BLOOM_WIDTH 32
540#else
541#error "LONG_BIT is smaller than 32"
542#endif
543
Thomas Wouters477c8d52006-05-27 19:21:47 +0000544#define BLOOM_MASK unsigned long
545
Serhiy Storchaka05997252013-01-26 12:14:02 +0200546static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547
/* Non-zero if the bit selected by the low bits of CH is set in MASK.
   A zero result means CH was not added to the mask; a non-zero result
   means it may have been.  MASK is parenthesized so that compound
   expressions such as "a | b" are tested as a whole. */
#define BLOOM(mask, ch)                                                 \
    ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Non-zero if CH is a line break.  Code points below 128 are looked up in
   ascii_linebreak[]; for the others bloom_linebreak is consulted first and
   Py_UNICODE_ISLINEBREAK() is only called when the mask matches. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000553
Alexander Belopolsky40018472011-02-26 01:02:56 +0000554Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000556{
Victor Stinnera85af502013-04-09 21:53:54 +0200557#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
558 do { \
559 TYPE *data = (TYPE *)PTR; \
560 TYPE *end = data + LEN; \
561 Py_UCS4 ch; \
562 for (; data != end; data++) { \
563 ch = *data; \
564 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
565 } \
566 break; \
567 } while (0)
568
Thomas Wouters477c8d52006-05-27 19:21:47 +0000569 /* calculate simple bloom-style bitmask for a given unicode string */
570
Antoine Pitrouf068f942010-01-13 14:19:12 +0000571 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000572
573 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200574 switch (kind) {
575 case PyUnicode_1BYTE_KIND:
576 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
577 break;
578 case PyUnicode_2BYTE_KIND:
579 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
580 break;
581 case PyUnicode_4BYTE_KIND:
582 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
583 break;
584 default:
585 assert(0);
586 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000587 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200588
589#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000590}
591
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200592/* Compilation of templated routines */
593
594#include "stringlib/asciilib.h"
595#include "stringlib/fastsearch.h"
596#include "stringlib/partition.h"
597#include "stringlib/split.h"
598#include "stringlib/count.h"
599#include "stringlib/find.h"
600#include "stringlib/find_max_char.h"
601#include "stringlib/localeutil.h"
602#include "stringlib/undef.h"
603
604#include "stringlib/ucs1lib.h"
605#include "stringlib/fastsearch.h"
606#include "stringlib/partition.h"
607#include "stringlib/split.h"
608#include "stringlib/count.h"
609#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300610#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200611#include "stringlib/find_max_char.h"
612#include "stringlib/localeutil.h"
613#include "stringlib/undef.h"
614
615#include "stringlib/ucs2lib.h"
616#include "stringlib/fastsearch.h"
617#include "stringlib/partition.h"
618#include "stringlib/split.h"
619#include "stringlib/count.h"
620#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300621#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200622#include "stringlib/find_max_char.h"
623#include "stringlib/localeutil.h"
624#include "stringlib/undef.h"
625
626#include "stringlib/ucs4lib.h"
627#include "stringlib/fastsearch.h"
628#include "stringlib/partition.h"
629#include "stringlib/split.h"
630#include "stringlib/count.h"
631#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300632#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200633#include "stringlib/find_max_char.h"
634#include "stringlib/localeutil.h"
635#include "stringlib/undef.h"
636
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200637#include "stringlib/unicodedefs.h"
638#include "stringlib/fastsearch.h"
639#include "stringlib/count.h"
640#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100641#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643/* --- Unicode Object ----------------------------------------------------- */
644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200646fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200648Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
649 Py_ssize_t size, Py_UCS4 ch,
650 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200651{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200652 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
653
654 switch (kind) {
655 case PyUnicode_1BYTE_KIND:
656 {
657 Py_UCS1 ch1 = (Py_UCS1) ch;
658 if (ch1 == ch)
659 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
660 else
661 return -1;
662 }
663 case PyUnicode_2BYTE_KIND:
664 {
665 Py_UCS2 ch2 = (Py_UCS2) ch;
666 if (ch2 == ch)
667 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
668 else
669 return -1;
670 }
671 case PyUnicode_4BYTE_KIND:
672 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
673 default:
674 assert(0);
675 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200677}
678
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect
   bugs earlier.

   Every byte of the characters from index OLD_LENGTH up to the current
   length of UNICODE is set to 0xff; nothing is written if the string did
   not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings.  U+00FF is invalid in ASCII and U+FFFFFFFF
   is an invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (old_length < new_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
697
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698static PyObject*
699resize_compact(PyObject *unicode, Py_ssize_t length)
700{
701 Py_ssize_t char_size;
702 Py_ssize_t struct_size;
703 Py_ssize_t new_size;
704 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100705 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200706#ifdef Py_DEBUG
707 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
708#endif
709
Victor Stinner79891572012-05-03 13:43:07 +0200710 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200711 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100712 assert(PyUnicode_IS_COMPACT(unicode));
713
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200714 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100715 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 struct_size = sizeof(PyASCIIObject);
717 else
718 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200719 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200720
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
722 PyErr_NoMemory();
723 return NULL;
724 }
725 new_size = (struct_size + (length + 1) * char_size);
726
Victor Stinner84def372011-12-11 20:04:56 +0100727 _Py_DEC_REFTOTAL;
728 _Py_ForgetReference(unicode);
729
730 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
731 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100732 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200733 PyErr_NoMemory();
734 return NULL;
735 }
Victor Stinner84def372011-12-11 20:04:56 +0100736 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100738
Victor Stinnerfe226c02011-10-03 03:52:20 +0200739 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200740 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100742 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200743 _PyUnicode_WSTR_LENGTH(unicode) = length;
744 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100745 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
746 PyObject_DEL(_PyUnicode_WSTR(unicode));
747 _PyUnicode_WSTR(unicode) = NULL;
748 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200749#ifdef Py_DEBUG
750 unicode_fill_invalid(unicode, old_length);
751#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200752 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
753 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200754 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200755 return unicode;
756}
757
Alexander Belopolsky40018472011-02-26 01:02:56 +0000758static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200759resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000760{
Victor Stinner95663112011-10-04 01:03:50 +0200761 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100762 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200763 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000765
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 if (PyUnicode_IS_READY(unicode)) {
767 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200768 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200770#ifdef Py_DEBUG
771 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
772#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773
774 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200775 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200776 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
777 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200778
779 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
780 PyErr_NoMemory();
781 return -1;
782 }
783 new_size = (length + 1) * char_size;
784
Victor Stinner7a9105a2011-12-12 00:13:42 +0100785 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
786 {
787 PyObject_DEL(_PyUnicode_UTF8(unicode));
788 _PyUnicode_UTF8(unicode) = NULL;
789 _PyUnicode_UTF8_LENGTH(unicode) = 0;
790 }
791
Victor Stinnerfe226c02011-10-03 03:52:20 +0200792 data = (PyObject *)PyObject_REALLOC(data, new_size);
793 if (data == NULL) {
794 PyErr_NoMemory();
795 return -1;
796 }
797 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200798 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200799 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200800 _PyUnicode_WSTR_LENGTH(unicode) = length;
801 }
802 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200803 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200804 _PyUnicode_UTF8_LENGTH(unicode) = length;
805 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200806 _PyUnicode_LENGTH(unicode) = length;
807 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200808#ifdef Py_DEBUG
809 unicode_fill_invalid(unicode, old_length);
810#endif
Victor Stinner95663112011-10-04 01:03:50 +0200811 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200812 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200814 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200815 }
Victor Stinner95663112011-10-04 01:03:50 +0200816 assert(_PyUnicode_WSTR(unicode) != NULL);
817
818 /* check for integer overflow */
819 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
820 PyErr_NoMemory();
821 return -1;
822 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100823 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200824 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100825 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200826 if (!wstr) {
827 PyErr_NoMemory();
828 return -1;
829 }
830 _PyUnicode_WSTR(unicode) = wstr;
831 _PyUnicode_WSTR(unicode)[length] = 0;
832 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200833 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834 return 0;
835}
836
Victor Stinnerfe226c02011-10-03 03:52:20 +0200837static PyObject*
838resize_copy(PyObject *unicode, Py_ssize_t length)
839{
840 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100841 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200842 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100843
Benjamin Petersonbac79492012-01-14 13:34:47 -0500844 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100845 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200846
847 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
848 if (copy == NULL)
849 return NULL;
850
851 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200852 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200853 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200854 }
855 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200856 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100857
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200858 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200859 if (w == NULL)
860 return NULL;
861 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
862 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200863 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
864 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200865 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200866 }
867}
868
Guido van Rossumd57fd912000-03-10 22:53:23 +0000869/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000870 Ux0000 terminated; some code (e.g. new_identifier)
871 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000872
873 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000874 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000875
876*/
877
Alexander Belopolsky40018472011-02-26 01:02:56 +0000878static PyUnicodeObject *
879_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200881 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200882 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883
Thomas Wouters477c8d52006-05-27 19:21:47 +0000884 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885 if (length == 0 && unicode_empty != NULL) {
886 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200887 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000888 }
889
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000890 /* Ensure we won't overflow the size. */
891 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
892 return (PyUnicodeObject *)PyErr_NoMemory();
893 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200894 if (length < 0) {
895 PyErr_SetString(PyExc_SystemError,
896 "Negative size passed to _PyUnicode_New");
897 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000898 }
899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200900 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
901 if (unicode == NULL)
902 return NULL;
903 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100904
905 _PyUnicode_WSTR_LENGTH(unicode) = length;
906 _PyUnicode_HASH(unicode) = -1;
907 _PyUnicode_STATE(unicode).interned = 0;
908 _PyUnicode_STATE(unicode).kind = 0;
909 _PyUnicode_STATE(unicode).compact = 0;
910 _PyUnicode_STATE(unicode).ready = 0;
911 _PyUnicode_STATE(unicode).ascii = 0;
912 _PyUnicode_DATA_ANY(unicode) = NULL;
913 _PyUnicode_LENGTH(unicode) = 0;
914 _PyUnicode_UTF8(unicode) = NULL;
915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
918 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100919 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000920 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100921 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000922 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200923
Jeremy Hyltond8082792003-09-16 19:41:39 +0000924 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000925 * the caller fails before initializing str -- unicode_resize()
926 * reads str[0], and the Keep-Alive optimization can keep memory
927 * allocated for str alive across a call to unicode_dealloc(unicode).
928 * We don't want unicode_resize to read uninitialized memory in
929 * that case.
930 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200931 _PyUnicode_WSTR(unicode)[0] = 0;
932 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100933
Victor Stinner7931d9a2011-11-04 00:22:48 +0100934 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000935 return unicode;
936}
937
Victor Stinnerf42dc442011-10-02 23:33:16 +0200938static const char*
939unicode_kind_name(PyObject *unicode)
940{
Victor Stinner42dfd712011-10-03 14:41:45 +0200941 /* don't check consistency: unicode_kind_name() is called from
942 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943 if (!PyUnicode_IS_COMPACT(unicode))
944 {
945 if (!PyUnicode_IS_READY(unicode))
946 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600947 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200948 {
949 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200951 return "legacy ascii";
952 else
953 return "legacy latin1";
954 case PyUnicode_2BYTE_KIND:
955 return "legacy UCS2";
956 case PyUnicode_4BYTE_KIND:
957 return "legacy UCS4";
958 default:
959 return "<legacy invalid kind>";
960 }
961 }
962 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600963 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200964 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200965 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200966 return "ascii";
967 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200968 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200970 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200971 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200972 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200973 default:
974 return "<invalid compact kind>";
975 }
976}
977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200978#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979/* Functions wrapping macros for use in debugger */
/* Debugger-callable wrapper around the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
983
/* Debugger-callable wrapper around the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
987void *_PyUnicode_data(void *unicode){
988 printf("obj %p\n", unicode);
989 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
990 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
991 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
992 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
993 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
994 return PyUnicode_DATA(unicode);
995}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200996
997void
998_PyUnicode_Dump(PyObject *op)
999{
1000 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001001 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1002 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1003 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001004
Victor Stinnera849a4b2011-10-03 12:12:11 +02001005 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001006 {
1007 if (ascii->state.ascii)
1008 data = (ascii + 1);
1009 else
1010 data = (compact + 1);
1011 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001012 else
1013 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +02001014 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
1015
Victor Stinnera849a4b2011-10-03 12:12:11 +02001016 if (ascii->wstr == data)
1017 printf("shared ");
1018 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001019
Victor Stinnera3b334d2011-10-03 13:53:37 +02001020 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001021 printf(" (%zu), ", compact->wstr_length);
1022 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1023 printf("shared ");
1024 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001025 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001026 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001027}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001028#endif
1029
1030PyObject *
1031PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1032{
1033 PyObject *obj;
1034 PyCompactUnicodeObject *unicode;
1035 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001036 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001037 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001038 Py_ssize_t char_size;
1039 Py_ssize_t struct_size;
1040
1041 /* Optimization for empty strings */
1042 if (size == 0 && unicode_empty != NULL) {
1043 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001044 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 }
1046
Victor Stinner9e9d6892011-10-04 01:02:02 +02001047 is_ascii = 0;
1048 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 struct_size = sizeof(PyCompactUnicodeObject);
1050 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001051 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 char_size = 1;
1053 is_ascii = 1;
1054 struct_size = sizeof(PyASCIIObject);
1055 }
1056 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001057 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 char_size = 1;
1059 }
1060 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001061 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 char_size = 2;
1063 if (sizeof(wchar_t) == 2)
1064 is_sharing = 1;
1065 }
1066 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001067 if (maxchar > MAX_UNICODE) {
1068 PyErr_SetString(PyExc_SystemError,
1069 "invalid maximum character passed to PyUnicode_New");
1070 return NULL;
1071 }
Victor Stinner8f825062012-04-27 13:55:39 +02001072 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001073 char_size = 4;
1074 if (sizeof(wchar_t) == 4)
1075 is_sharing = 1;
1076 }
1077
1078 /* Ensure we won't overflow the size. */
1079 if (size < 0) {
1080 PyErr_SetString(PyExc_SystemError,
1081 "Negative size passed to PyUnicode_New");
1082 return NULL;
1083 }
1084 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1085 return PyErr_NoMemory();
1086
1087 /* Duplicated allocation code from _PyObject_New() instead of a call to
1088 * PyObject_New() so we are able to allocate space for the object and
1089 * it's data buffer.
1090 */
1091 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1092 if (obj == NULL)
1093 return PyErr_NoMemory();
1094 obj = PyObject_INIT(obj, &PyUnicode_Type);
1095 if (obj == NULL)
1096 return NULL;
1097
1098 unicode = (PyCompactUnicodeObject *)obj;
1099 if (is_ascii)
1100 data = ((PyASCIIObject*)obj) + 1;
1101 else
1102 data = unicode + 1;
1103 _PyUnicode_LENGTH(unicode) = size;
1104 _PyUnicode_HASH(unicode) = -1;
1105 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001106 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 _PyUnicode_STATE(unicode).compact = 1;
1108 _PyUnicode_STATE(unicode).ready = 1;
1109 _PyUnicode_STATE(unicode).ascii = is_ascii;
1110 if (is_ascii) {
1111 ((char*)data)[size] = 0;
1112 _PyUnicode_WSTR(unicode) = NULL;
1113 }
Victor Stinner8f825062012-04-27 13:55:39 +02001114 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001115 ((char*)data)[size] = 0;
1116 _PyUnicode_WSTR(unicode) = NULL;
1117 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001119 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001120 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 else {
1122 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001123 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001124 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001126 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127 ((Py_UCS4*)data)[size] = 0;
1128 if (is_sharing) {
1129 _PyUnicode_WSTR_LENGTH(unicode) = size;
1130 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1131 }
1132 else {
1133 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1134 _PyUnicode_WSTR(unicode) = NULL;
1135 }
1136 }
Victor Stinner8f825062012-04-27 13:55:39 +02001137#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001138 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001139#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001140 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141 return obj;
1142}
1143
1144#if SIZEOF_WCHAR_T == 2
1145/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1146 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001147 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148
1149 This function assumes that unicode can hold one more code point than wstr
1150 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001151static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001152unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001153 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001154{
1155 const wchar_t *iter;
1156 Py_UCS4 *ucs4_out;
1157
Victor Stinner910337b2011-10-03 03:20:16 +02001158 assert(unicode != NULL);
1159 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001160 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1161 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1162
1163 for (iter = begin; iter < end; ) {
1164 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1165 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001166 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1167 && (iter+1) < end
1168 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001169 {
Victor Stinner551ac952011-11-29 22:58:13 +01001170 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171 iter += 2;
1172 }
1173 else {
1174 *ucs4_out++ = *iter;
1175 iter++;
1176 }
1177 }
1178 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1179 _PyUnicode_GET_LENGTH(unicode)));
1180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181}
1182#endif
1183
Victor Stinnercd9950f2011-10-02 00:34:53 +02001184static int
Victor Stinner488fa492011-12-12 00:01:39 +01001185unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001186{
Victor Stinner488fa492011-12-12 00:01:39 +01001187 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001188 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001189 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001190 return -1;
1191 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001192 return 0;
1193}
1194
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001195static int
1196_copy_characters(PyObject *to, Py_ssize_t to_start,
1197 PyObject *from, Py_ssize_t from_start,
1198 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001199{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001200 unsigned int from_kind, to_kind;
1201 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001202
Victor Stinneree4544c2012-05-09 22:24:08 +02001203 assert(0 <= how_many);
1204 assert(0 <= from_start);
1205 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001206 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001207 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001208 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209
Victor Stinnerd3f08822012-05-29 12:57:52 +02001210 assert(PyUnicode_Check(to));
1211 assert(PyUnicode_IS_READY(to));
1212 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1213
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001214 if (how_many == 0)
1215 return 0;
1216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001218 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001221
Victor Stinnerf1852262012-06-16 16:38:26 +02001222#ifdef Py_DEBUG
1223 if (!check_maxchar
1224 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1225 {
1226 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1227 Py_UCS4 ch;
1228 Py_ssize_t i;
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 assert(ch <= to_maxchar);
1232 }
1233 }
1234#endif
1235
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001236 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001237 if (check_maxchar
1238 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1239 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001240 /* Writing Latin-1 characters into an ASCII string requires to
1241 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001242 Py_UCS4 max_char;
1243 max_char = ucs1lib_find_max_char(from_data,
1244 (Py_UCS1*)from_data + how_many);
1245 if (max_char >= 128)
1246 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001247 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001248 Py_MEMCPY((char*)to_data + to_kind * to_start,
1249 (char*)from_data + from_kind * from_start,
1250 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001252 else if (from_kind == PyUnicode_1BYTE_KIND
1253 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001254 {
1255 _PyUnicode_CONVERT_BYTES(
1256 Py_UCS1, Py_UCS2,
1257 PyUnicode_1BYTE_DATA(from) + from_start,
1258 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1259 PyUnicode_2BYTE_DATA(to) + to_start
1260 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001261 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001262 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001263 && to_kind == PyUnicode_4BYTE_KIND)
1264 {
1265 _PyUnicode_CONVERT_BYTES(
1266 Py_UCS1, Py_UCS4,
1267 PyUnicode_1BYTE_DATA(from) + from_start,
1268 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1269 PyUnicode_4BYTE_DATA(to) + to_start
1270 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001271 }
1272 else if (from_kind == PyUnicode_2BYTE_KIND
1273 && to_kind == PyUnicode_4BYTE_KIND)
1274 {
1275 _PyUnicode_CONVERT_BYTES(
1276 Py_UCS2, Py_UCS4,
1277 PyUnicode_2BYTE_DATA(from) + from_start,
1278 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1279 PyUnicode_4BYTE_DATA(to) + to_start
1280 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001281 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001282 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001283 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1284
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001285 if (!check_maxchar) {
1286 if (from_kind == PyUnicode_2BYTE_KIND
1287 && to_kind == PyUnicode_1BYTE_KIND)
1288 {
1289 _PyUnicode_CONVERT_BYTES(
1290 Py_UCS2, Py_UCS1,
1291 PyUnicode_2BYTE_DATA(from) + from_start,
1292 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1293 PyUnicode_1BYTE_DATA(to) + to_start
1294 );
1295 }
1296 else if (from_kind == PyUnicode_4BYTE_KIND
1297 && to_kind == PyUnicode_1BYTE_KIND)
1298 {
1299 _PyUnicode_CONVERT_BYTES(
1300 Py_UCS4, Py_UCS1,
1301 PyUnicode_4BYTE_DATA(from) + from_start,
1302 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1303 PyUnicode_1BYTE_DATA(to) + to_start
1304 );
1305 }
1306 else if (from_kind == PyUnicode_4BYTE_KIND
1307 && to_kind == PyUnicode_2BYTE_KIND)
1308 {
1309 _PyUnicode_CONVERT_BYTES(
1310 Py_UCS4, Py_UCS2,
1311 PyUnicode_4BYTE_DATA(from) + from_start,
1312 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1313 PyUnicode_2BYTE_DATA(to) + to_start
1314 );
1315 }
1316 else {
1317 assert(0);
1318 return -1;
1319 }
1320 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001321 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001322 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001323 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001324 Py_ssize_t i;
1325
Victor Stinnera0702ab2011-09-29 14:14:38 +02001326 for (i=0; i < how_many; i++) {
1327 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001328 if (ch > to_maxchar)
1329 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001330 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1331 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001332 }
1333 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001334 return 0;
1335}
1336
Victor Stinnerd3f08822012-05-29 12:57:52 +02001337void
1338_PyUnicode_FastCopyCharacters(
1339 PyObject *to, Py_ssize_t to_start,
1340 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001341{
1342 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1343}
1344
1345Py_ssize_t
1346PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1347 PyObject *from, Py_ssize_t from_start,
1348 Py_ssize_t how_many)
1349{
1350 int err;
1351
1352 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1353 PyErr_BadInternalCall();
1354 return -1;
1355 }
1356
Benjamin Petersonbac79492012-01-14 13:34:47 -05001357 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001358 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001359 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001360 return -1;
1361
Victor Stinnerd3f08822012-05-29 12:57:52 +02001362 if (from_start < 0) {
1363 PyErr_SetString(PyExc_IndexError, "string index out of range");
1364 return -1;
1365 }
1366 if (to_start < 0) {
1367 PyErr_SetString(PyExc_IndexError, "string index out of range");
1368 return -1;
1369 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001370 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1371 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1372 PyErr_Format(PyExc_SystemError,
1373 "Cannot write %zi characters at %zi "
1374 "in a string of %zi characters",
1375 how_many, to_start, PyUnicode_GET_LENGTH(to));
1376 return -1;
1377 }
1378
1379 if (how_many == 0)
1380 return 0;
1381
Victor Stinner488fa492011-12-12 00:01:39 +01001382 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001383 return -1;
1384
1385 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1386 if (err) {
1387 PyErr_Format(PyExc_SystemError,
1388 "Cannot copy %s characters "
1389 "into a string of %s characters",
1390 unicode_kind_name(from),
1391 unicode_kind_name(to));
1392 return -1;
1393 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001394 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395}
1396
Victor Stinner17222162011-09-28 22:15:37 +02001397/* Find the maximum code point and count the number of surrogate pairs so a
1398 correct string length can be computed before converting a string to UCS4.
1399 This function counts single surrogates as a character and not as a pair.
1400
1401 Return 0 on success, or -1 on error. */
1402static int
1403find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1404 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405{
1406 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001407 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408
Victor Stinnerc53be962011-10-02 21:33:54 +02001409 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 *num_surrogates = 0;
1411 *maxchar = 0;
1412
1413 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001415 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1416 && (iter+1) < end
1417 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1418 {
1419 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1420 ++(*num_surrogates);
1421 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 }
1423 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001425 {
1426 ch = *iter;
1427 iter++;
1428 }
1429 if (ch > *maxchar) {
1430 *maxchar = ch;
1431 if (*maxchar > MAX_UNICODE) {
1432 PyErr_Format(PyExc_ValueError,
1433 "character U+%x is not in range [U+0000; U+10ffff]",
1434 ch);
1435 return -1;
1436 }
1437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
1439 return 0;
1440}
1441
/* Give a not-yet-ready string (one that only has a wchar_t buffer) its
   canonical representation.

   The wchar_t buffer is scanned for its largest code point, and the string
   becomes a 1-byte, 2-byte or 4-byte string accordingly.  Where wchar_t
   already has the width of the chosen kind, the existing buffer is reused as
   the data buffer; otherwise a new buffer is allocated, filled, and the
   wchar_t buffer is freed.

   Return 0 on success.  Return -1 with an exception set if the buffer holds
   a code point that find_maxchar_surrogates() rejects, or if an allocation
   fails. */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    /* One past the last wchar_t of the source buffer. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    /* Every character fits in one byte: narrow into a fresh buffer. */
    if (maxchar < 256) {
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the UTF-8 pointer is aimed at the data buffer
               itself rather than at a separate allocation. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        /* The wchar_t buffer is no longer needed. */
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bit, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair collapses into a single character. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide here: reuse the buffer as is. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1566
Alexander Belopolsky40018472011-02-26 01:02:56 +00001567static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001568unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569{
Walter Dörwald16807132007-05-25 13:52:07 +00001570 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001571 case SSTATE_NOT_INTERNED:
1572 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001573
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 case SSTATE_INTERNED_MORTAL:
1575 /* revive dead object temporarily for DelItem */
1576 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001577 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001578 Py_FatalError(
1579 "deletion of interned string failed");
1580 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001581
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 case SSTATE_INTERNED_IMMORTAL:
1583 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001584
Benjamin Peterson29060642009-01-31 22:14:21 +00001585 default:
1586 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001587 }
1588
Victor Stinner03490912011-10-03 23:45:12 +02001589 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001590 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001591 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001592 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001593 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1594 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001595
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001596 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001597}
1598
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared objects kept
   by this file (the empty string, or an entry of the unicode_latin1 cache of
   one-character strings), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    ascii = (PyASCIIObject *)unicode;
    /* Only a one-character string that is no longer of the wchar kind can
       be an entry of the latin1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1615
Alexander Belopolsky40018472011-02-26 01:02:56 +00001616static int
Victor Stinner488fa492011-12-12 00:01:39 +01001617unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001618{
Victor Stinner488fa492011-12-12 00:01:39 +01001619 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 if (Py_REFCNT(unicode) != 1)
1621 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001622 if (_PyUnicode_HASH(unicode) != -1)
1623 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001624 if (PyUnicode_CHECK_INTERNED(unicode))
1625 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001626 if (!PyUnicode_CheckExact(unicode))
1627 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001628#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001629 /* singleton refcount is greater than 1 */
1630 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001631#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001632 return 1;
1633}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001634
Victor Stinnerfe226c02011-10-03 03:52:20 +02001635static int
1636unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1637{
1638 PyObject *unicode;
1639 Py_ssize_t old_length;
1640
1641 assert(p_unicode != NULL);
1642 unicode = *p_unicode;
1643
1644 assert(unicode != NULL);
1645 assert(PyUnicode_Check(unicode));
1646 assert(0 <= length);
1647
Victor Stinner910337b2011-10-03 03:20:16 +02001648 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001649 old_length = PyUnicode_WSTR_LENGTH(unicode);
1650 else
1651 old_length = PyUnicode_GET_LENGTH(unicode);
1652 if (old_length == length)
1653 return 0;
1654
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001655 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001656 _Py_INCREF_UNICODE_EMPTY();
1657 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001658 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001659 Py_DECREF(*p_unicode);
1660 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001661 return 0;
1662 }
1663
Victor Stinner488fa492011-12-12 00:01:39 +01001664 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001665 PyObject *copy = resize_copy(unicode, length);
1666 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001667 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001668 Py_DECREF(*p_unicode);
1669 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001670 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001671 }
1672
Victor Stinnerfe226c02011-10-03 03:52:20 +02001673 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001674 PyObject *new_unicode = resize_compact(unicode, length);
1675 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001676 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001677 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001678 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001679 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001680 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001681}
1682
Alexander Belopolsky40018472011-02-26 01:02:56 +00001683int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001684PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001685{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001686 PyObject *unicode;
1687 if (p_unicode == NULL) {
1688 PyErr_BadInternalCall();
1689 return -1;
1690 }
1691 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001692 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001693 {
1694 PyErr_BadInternalCall();
1695 return -1;
1696 }
1697 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001698}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001699
Victor Stinnerc5166102012-02-22 13:55:02 +01001700/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001701
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001702 WARNING: The function doesn't copy the terminating null character and
1703 doesn't check the maximum character (may write a latin1 character in an
1704 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001705static void
1706unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1707 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001708{
1709 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1710 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001711 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001712
1713 switch (kind) {
1714 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001715 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001716#ifdef Py_DEBUG
1717 if (PyUnicode_IS_ASCII(unicode)) {
1718 Py_UCS4 maxchar = ucs1lib_find_max_char(
1719 (const Py_UCS1*)str,
1720 (const Py_UCS1*)str + len);
1721 assert(maxchar < 128);
1722 }
1723#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001724 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001725 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001726 }
1727 case PyUnicode_2BYTE_KIND: {
1728 Py_UCS2 *start = (Py_UCS2 *)data + index;
1729 Py_UCS2 *ucs2 = start;
1730 assert(index <= PyUnicode_GET_LENGTH(unicode));
1731
Victor Stinner184252a2012-06-16 02:57:41 +02001732 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001733 *ucs2 = (Py_UCS2)*str;
1734
1735 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001736 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001737 }
1738 default: {
1739 Py_UCS4 *start = (Py_UCS4 *)data + index;
1740 Py_UCS4 *ucs4 = start;
1741 assert(kind == PyUnicode_4BYTE_KIND);
1742 assert(index <= PyUnicode_GET_LENGTH(unicode));
1743
Victor Stinner184252a2012-06-16 02:57:41 +02001744 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001745 *ucs4 = (Py_UCS4)*str;
1746
1747 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001748 }
1749 }
1750}
1751
1752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753static PyObject*
1754get_latin1_char(unsigned char ch)
1755{
Victor Stinnera464fc12011-10-02 20:39:30 +02001756 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001758 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759 if (!unicode)
1760 return NULL;
1761 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001762 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763 unicode_latin1[ch] = unicode;
1764 }
1765 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001766 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767}
1768
Alexander Belopolsky40018472011-02-26 01:02:56 +00001769PyObject *
1770PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001772 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773 Py_UCS4 maxchar = 0;
1774 Py_ssize_t num_surrogates;
1775
1776 if (u == NULL)
1777 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001779 /* If the Unicode data is known at construction time, we can apply
1780 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001782 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001783 if (size == 0)
1784 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001786 /* Single character Unicode objects in the Latin-1 range are
1787 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001788 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 return get_latin1_char((unsigned char)*u);
1790
1791 /* If not empty and not single character, copy the Unicode data
1792 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001793 if (find_maxchar_surrogates(u, u + size,
1794 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001795 return NULL;
1796
Victor Stinner8faf8212011-12-08 22:14:11 +01001797 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798 if (!unicode)
1799 return NULL;
1800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 switch (PyUnicode_KIND(unicode)) {
1802 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001803 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001804 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1805 break;
1806 case PyUnicode_2BYTE_KIND:
1807#if Py_UNICODE_SIZE == 2
1808 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1809#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001810 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1812#endif
1813 break;
1814 case PyUnicode_4BYTE_KIND:
1815#if SIZEOF_WCHAR_T == 2
1816 /* This is the only case which has to process surrogates, thus
1817 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001818 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001819#else
1820 assert(num_surrogates == 0);
1821 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1822#endif
1823 break;
1824 default:
1825 assert(0 && "Impossible state");
1826 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001827
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001828 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001829}
1830
Alexander Belopolsky40018472011-02-26 01:02:56 +00001831PyObject *
1832PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001833{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001834 if (size < 0) {
1835 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001836 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001837 return NULL;
1838 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001839 if (u != NULL)
1840 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1841 else
1842 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001843}
1844
Alexander Belopolsky40018472011-02-26 01:02:56 +00001845PyObject *
1846PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001847{
1848 size_t size = strlen(u);
1849 if (size > PY_SSIZE_T_MAX) {
1850 PyErr_SetString(PyExc_OverflowError, "input too long");
1851 return NULL;
1852 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001853 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001854}
1855
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001856PyObject *
1857_PyUnicode_FromId(_Py_Identifier *id)
1858{
1859 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001860 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1861 strlen(id->string),
1862 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001863 if (!id->object)
1864 return NULL;
1865 PyUnicode_InternInPlace(&id->object);
1866 assert(!id->next);
1867 id->next = static_strings;
1868 static_strings = id;
1869 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001870 return id->object;
1871}
1872
1873void
1874_PyUnicode_ClearStaticStrings()
1875{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001876 _Py_Identifier *tmp, *s = static_strings;
1877 while (s) {
1878 Py_DECREF(s->object);
1879 s->object = NULL;
1880 tmp = s->next;
1881 s->next = NULL;
1882 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001883 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001884 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001885}
1886
Benjamin Peterson0df54292012-03-26 14:50:32 -04001887/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001888
Victor Stinnerd3f08822012-05-29 12:57:52 +02001889PyObject*
1890_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001891{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001892 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001893 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001894 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001895#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001896 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001897#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001898 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001899 }
Victor Stinner785938e2011-12-11 20:09:03 +01001900 unicode = PyUnicode_New(size, 127);
1901 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001902 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001903 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1904 assert(_PyUnicode_CheckConsistency(unicode, 1));
1905 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001906}
1907
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001908static Py_UCS4
1909kind_maxchar_limit(unsigned int kind)
1910{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001911 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001912 case PyUnicode_1BYTE_KIND:
1913 return 0x80;
1914 case PyUnicode_2BYTE_KIND:
1915 return 0x100;
1916 case PyUnicode_4BYTE_KIND:
1917 return 0x10000;
1918 default:
1919 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001920 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001921 }
1922}
1923
Victor Stinnere6abb482012-05-02 01:15:40 +02001924Py_LOCAL_INLINE(Py_UCS4)
1925align_maxchar(Py_UCS4 maxchar)
1926{
1927 if (maxchar <= 127)
1928 return 127;
1929 else if (maxchar <= 255)
1930 return 255;
1931 else if (maxchar <= 65535)
1932 return 65535;
1933 else
1934 return MAX_UNICODE;
1935}
1936
Victor Stinner702c7342011-10-05 13:50:52 +02001937static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001938_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001939{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001940 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001941 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001942
Serhiy Storchaka678db842013-01-26 12:16:36 +02001943 if (size == 0)
1944 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001945 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001946 if (size == 1)
1947 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001948
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001949 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001950 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001951 if (!res)
1952 return NULL;
1953 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001954 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001955 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001956}
1957
Victor Stinnere57b1c02011-09-28 22:20:48 +02001958static PyObject*
1959_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001960{
1961 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001962 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001963
Serhiy Storchaka678db842013-01-26 12:16:36 +02001964 if (size == 0)
1965 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001966 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001967 if (size == 1) {
1968 Py_UCS4 ch = u[0];
Victor Stinnerf50a4e92013-04-09 22:38:52 +02001969 int kind;
1970 void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001971 if (ch < 256)
1972 return get_latin1_char((unsigned char)ch);
1973
1974 res = PyUnicode_New(1, ch);
1975 if (res == NULL)
1976 return NULL;
Victor Stinnerf50a4e92013-04-09 22:38:52 +02001977 kind = PyUnicode_KIND(res);
1978 data = PyUnicode_DATA(res);
1979 PyUnicode_WRITE(kind, data, 0, ch);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001980 assert(_PyUnicode_CheckConsistency(res, 1));
1981 return res;
1982 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001983
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001984 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001985 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986 if (!res)
1987 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001988 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001989 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001990 else {
1991 _PyUnicode_CONVERT_BYTES(
1992 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1993 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001994 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995 return res;
1996}
1997
Victor Stinnere57b1c02011-09-28 22:20:48 +02001998static PyObject*
1999_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000{
2001 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002002 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002003
Serhiy Storchaka678db842013-01-26 12:16:36 +02002004 if (size == 0)
2005 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002006 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002007 if (size == 1) {
2008 Py_UCS4 ch = u[0];
Victor Stinnerf50a4e92013-04-09 22:38:52 +02002009 int kind;
2010 void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002011 if (ch < 256)
2012 return get_latin1_char((unsigned char)ch);
2013
2014 res = PyUnicode_New(1, ch);
2015 if (res == NULL)
2016 return NULL;
Victor Stinnerf50a4e92013-04-09 22:38:52 +02002017 kind = PyUnicode_KIND(res);
2018 data = PyUnicode_DATA(res);
2019 PyUnicode_WRITE(kind, data, 0, ch);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02002020 assert(_PyUnicode_CheckConsistency(res, 1));
2021 return res;
2022 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002023
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002024 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002025 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 if (!res)
2027 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002028 if (max_char < 256)
2029 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2030 PyUnicode_1BYTE_DATA(res));
2031 else if (max_char < 0x10000)
2032 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2033 PyUnicode_2BYTE_DATA(res));
2034 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002036 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 return res;
2038}
2039
2040PyObject*
2041PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2042{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002043 if (size < 0) {
2044 PyErr_SetString(PyExc_ValueError, "size must be positive");
2045 return NULL;
2046 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002047 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002049 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002051 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002052 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002053 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002054 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002055 PyErr_SetString(PyExc_SystemError, "invalid kind");
2056 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002057 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002058}
2059
Victor Stinnerece58de2012-04-23 23:36:38 +02002060Py_UCS4
2061_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2062{
2063 enum PyUnicode_Kind kind;
2064 void *startptr, *endptr;
2065
2066 assert(PyUnicode_IS_READY(unicode));
2067 assert(0 <= start);
2068 assert(end <= PyUnicode_GET_LENGTH(unicode));
2069 assert(start <= end);
2070
2071 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2072 return PyUnicode_MAX_CHAR_VALUE(unicode);
2073
2074 if (start == end)
2075 return 127;
2076
Victor Stinner94d558b2012-04-27 22:26:58 +02002077 if (PyUnicode_IS_ASCII(unicode))
2078 return 127;
2079
Victor Stinnerece58de2012-04-23 23:36:38 +02002080 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002081 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002082 endptr = (char *)startptr + end * kind;
2083 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002084 switch(kind) {
2085 case PyUnicode_1BYTE_KIND:
2086 return ucs1lib_find_max_char(startptr, endptr);
2087 case PyUnicode_2BYTE_KIND:
2088 return ucs2lib_find_max_char(startptr, endptr);
2089 case PyUnicode_4BYTE_KIND:
2090 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002091 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002092 assert(0);
2093 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002094 }
2095}
2096
Victor Stinner25a4b292011-10-06 12:31:55 +02002097/* Ensure that a string uses the most efficient storage, if it is not the
2098 case: create a new string with of the right kind. Write NULL into *p_unicode
2099 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002100static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002101unicode_adjust_maxchar(PyObject **p_unicode)
2102{
2103 PyObject *unicode, *copy;
2104 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002105 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002106 unsigned int kind;
2107
2108 assert(p_unicode != NULL);
2109 unicode = *p_unicode;
2110 assert(PyUnicode_IS_READY(unicode));
2111 if (PyUnicode_IS_ASCII(unicode))
2112 return;
2113
2114 len = PyUnicode_GET_LENGTH(unicode);
2115 kind = PyUnicode_KIND(unicode);
2116 if (kind == PyUnicode_1BYTE_KIND) {
2117 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002118 max_char = ucs1lib_find_max_char(u, u + len);
2119 if (max_char >= 128)
2120 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002121 }
2122 else if (kind == PyUnicode_2BYTE_KIND) {
2123 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002124 max_char = ucs2lib_find_max_char(u, u + len);
2125 if (max_char >= 256)
2126 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002127 }
2128 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002129 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002130 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002131 max_char = ucs4lib_find_max_char(u, u + len);
2132 if (max_char >= 0x10000)
2133 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002134 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002135 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002136 if (copy != NULL)
2137 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002138 Py_DECREF(unicode);
2139 *p_unicode = copy;
2140}
2141
Victor Stinner034f6cf2011-09-30 02:26:44 +02002142PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002143_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002144{
Victor Stinner87af4f22011-11-21 23:03:47 +01002145 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002146 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002147
Victor Stinner034f6cf2011-09-30 02:26:44 +02002148 if (!PyUnicode_Check(unicode)) {
2149 PyErr_BadInternalCall();
2150 return NULL;
2151 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002152 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002153 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002154
Victor Stinner87af4f22011-11-21 23:03:47 +01002155 length = PyUnicode_GET_LENGTH(unicode);
2156 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002157 if (!copy)
2158 return NULL;
2159 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2160
Victor Stinner87af4f22011-11-21 23:03:47 +01002161 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2162 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002163 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002164 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002165}
2166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002167
Victor Stinnerbc603d12011-10-02 01:00:40 +02002168/* Widen Unicode objects to larger buffers. Don't write terminating null
2169 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002170
2171void*
2172_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2173{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002174 Py_ssize_t len;
2175 void *result;
2176 unsigned int skind;
2177
Benjamin Petersonbac79492012-01-14 13:34:47 -05002178 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002179 return NULL;
2180
2181 len = PyUnicode_GET_LENGTH(s);
2182 skind = PyUnicode_KIND(s);
2183 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002184 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 return NULL;
2186 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002187 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002188 case PyUnicode_2BYTE_KIND:
2189 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2190 if (!result)
2191 return PyErr_NoMemory();
2192 assert(skind == PyUnicode_1BYTE_KIND);
2193 _PyUnicode_CONVERT_BYTES(
2194 Py_UCS1, Py_UCS2,
2195 PyUnicode_1BYTE_DATA(s),
2196 PyUnicode_1BYTE_DATA(s) + len,
2197 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002198 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002199 case PyUnicode_4BYTE_KIND:
2200 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2201 if (!result)
2202 return PyErr_NoMemory();
2203 if (skind == PyUnicode_2BYTE_KIND) {
2204 _PyUnicode_CONVERT_BYTES(
2205 Py_UCS2, Py_UCS4,
2206 PyUnicode_2BYTE_DATA(s),
2207 PyUnicode_2BYTE_DATA(s) + len,
2208 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002210 else {
2211 assert(skind == PyUnicode_1BYTE_KIND);
2212 _PyUnicode_CONVERT_BYTES(
2213 Py_UCS1, Py_UCS4,
2214 PyUnicode_1BYTE_DATA(s),
2215 PyUnicode_1BYTE_DATA(s) + len,
2216 result);
2217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002219 default:
2220 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 }
Victor Stinner01698042011-10-04 00:04:26 +02002222 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 return NULL;
2224}
2225
2226static Py_UCS4*
2227as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2228 int copy_null)
2229{
2230 int kind;
2231 void *data;
2232 Py_ssize_t len, targetlen;
2233 if (PyUnicode_READY(string) == -1)
2234 return NULL;
2235 kind = PyUnicode_KIND(string);
2236 data = PyUnicode_DATA(string);
2237 len = PyUnicode_GET_LENGTH(string);
2238 targetlen = len;
2239 if (copy_null)
2240 targetlen++;
2241 if (!target) {
2242 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2243 PyErr_NoMemory();
2244 return NULL;
2245 }
2246 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2247 if (!target) {
2248 PyErr_NoMemory();
2249 return NULL;
2250 }
2251 }
2252 else {
2253 if (targetsize < targetlen) {
2254 PyErr_Format(PyExc_SystemError,
2255 "string is longer than the buffer");
2256 if (copy_null && 0 < targetsize)
2257 target[0] = 0;
2258 return NULL;
2259 }
2260 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002261 if (kind == PyUnicode_1BYTE_KIND) {
2262 Py_UCS1 *start = (Py_UCS1 *) data;
2263 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002265 else if (kind == PyUnicode_2BYTE_KIND) {
2266 Py_UCS2 *start = (Py_UCS2 *) data;
2267 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2268 }
2269 else {
2270 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002272 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 if (copy_null)
2274 target[len] = 0;
2275 return target;
2276}
2277
2278Py_UCS4*
2279PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2280 int copy_null)
2281{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002282 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002283 PyErr_BadInternalCall();
2284 return NULL;
2285 }
2286 return as_ucs4(string, target, targetsize, copy_null);
2287}
2288
2289Py_UCS4*
2290PyUnicode_AsUCS4Copy(PyObject *string)
2291{
2292 return as_ucs4(string, NULL, 0, 1);
2293}
2294
2295#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002296
Alexander Belopolsky40018472011-02-26 01:02:56 +00002297PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002298PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002302 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002303 PyErr_BadInternalCall();
2304 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002305 }
2306
Martin v. Löwis790465f2008-04-05 20:41:37 +00002307 if (size == -1) {
2308 size = wcslen(w);
2309 }
2310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002311 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312}
2313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002314#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002315
Walter Dörwald346737f2007-05-31 10:44:43 +00002316static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002317makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002318 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002319{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002320 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002321 if (longflag)
2322 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002323 else if (longlongflag) {
2324 /* longlongflag should only ever be nonzero on machines with
2325 HAVE_LONG_LONG defined */
2326#ifdef HAVE_LONG_LONG
2327 char *f = PY_FORMAT_LONG_LONG;
2328 while (*f)
2329 *fmt++ = *f++;
2330#else
2331 /* we shouldn't ever get here */
2332 assert(0);
2333 *fmt++ = 'l';
2334#endif
2335 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002336 else if (size_tflag) {
2337 char *f = PY_FORMAT_SIZE_T;
2338 while (*f)
2339 *fmt++ = *f++;
2340 }
2341 *fmt++ = c;
2342 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002343}
2344
/* maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).
   For example, with SIZEOF_LONG_LONG == 8 the expression evaluates to 21.
   NOTE(review): the leading 2 presumably covers the sign plus the NUL
   terminator written by sprintf -- confirm. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002349
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002350static int
2351unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2352 Py_ssize_t width, Py_ssize_t precision)
2353{
2354 Py_ssize_t length, fill, arglen;
2355 Py_UCS4 maxchar;
2356
2357 if (PyUnicode_READY(str) == -1)
2358 return -1;
2359
2360 length = PyUnicode_GET_LENGTH(str);
2361 if ((precision == -1 || precision >= length)
2362 && width <= length)
2363 return _PyUnicodeWriter_WriteStr(writer, str);
2364
2365 if (precision != -1)
2366 length = Py_MIN(precision, length);
2367
2368 arglen = Py_MAX(length, width);
2369 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2370 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2371 else
2372 maxchar = writer->maxchar;
2373
2374 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2375 return -1;
2376
2377 if (width > length) {
2378 fill = width - length;
2379 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2380 return -1;
2381 writer->pos += fill;
2382 }
2383
2384 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2385 str, 0, length);
2386 writer->pos += length;
2387 return 0;
2388}
2389
2390static int
2391unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2392 Py_ssize_t width, Py_ssize_t precision)
2393{
2394 /* UTF-8 */
2395 Py_ssize_t length;
2396 PyObject *unicode;
2397 int res;
2398
2399 length = strlen(str);
2400 if (precision != -1)
2401 length = Py_MIN(length, precision);
2402 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2403 if (unicode == NULL)
2404 return -1;
2405
2406 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2407 Py_DECREF(unicode);
2408 return res;
2409}
2410
/* Process one '%' directive of a PyUnicode_FromFormatV() format string.

   writer: output writer the formatted text is appended to.
   f:      points at the '%' that starts the directive.
   vargs:  argument list; the arguments required by the directive are
           consumed from it with va_arg().

   Returns a pointer to the first character after the directive.  For an
   unknown conversion character the rest of the format string is copied
   verbatim and a pointer to its terminating NUL is returned.  Returns NULL
   on error (an exception is set explicitly for out-of-range width,
   precision and %c arguments; other failures presumably leave the
   exception set by the failing callee). */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* Remember where the directive starts: the default case below copies
       the raw text from here. */
    p = f;
    f++;
    /* A leading '0' selects zero padding for the integer conversions. */
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    /* -1 means "not given" for both width and precision. */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* Check before multiplying so width cannot overflow. */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* Same overflow guard as for the width. */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    /* The length modifier is only consumed when it is immediately followed
       by 'd', 'u' or 'i'; otherwise f is left on the modifier character
       and the switch below falls into its default case. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* This directive ends the format string: switch off overallocation
       (presumably because nothing else will be appended afterwards). */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* A single code point, passed as int. */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char fmt[10]; /* should be enough for "%0lld\0" */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* The number is first rendered by sprintf() with a format that
           carries only the length modifier; width and precision are
           applied afterwards by hand. */
        if (*f == 'u') {
            makefmt(fmt, longflag, longlongflag, size_tflag, *f);

            if (longflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, size_t));
            else
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned int));
        }
        else if (*f == 'x') {
            /* %x takes no length modifier here: the argument is always
               read as an int. */
            makefmt(fmt, 0, 0, 0, 'x');
            len = sprintf(buffer, fmt, va_arg(*vargs, int));
        }
        else {
            makefmt(fmt, longflag, longlongflag, size_tflag, *f);

            if (longflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, Py_ssize_t));
            else
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, int));
        }
        assert(len >= 0);

        /* From here on precision is at least the number of characters
           sprintf produced (this also replaces the "not given" -1). */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* Width padding ('0' or ' ') comes first, then '0' padding up to
           the precision, then the sprintf output.
           NOTE(review): because padding is written before the sprintf
           output, any '-' sign ends up after the fill characters --
           confirm this is intended. */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined: ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* No prefix at all: shift the digits (and the NUL) right by
               two and insert "0x". */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* A string object, written as is. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* Two arguments are always consumed: a string object and a C
           string.  The C string is used only when the object is NULL. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* Write the result of PyObject_Str(obj). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* Write the result of PyObject_Repr(obj). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* Write the result of PyObject_ASCII(obj). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* A literal percent sign; no argument is consumed. */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        /* Everything was consumed: return the terminating NUL. */
        f = p+len;
        return f;
    }

    /* Step over the conversion character. */
    f++;
    return f;
}
2720
Walter Dörwaldd2034312007-05-18 16:29:38 +00002721PyObject *
2722PyUnicode_FromFormatV(const char *format, va_list vargs)
2723{
Victor Stinnere215d962012-10-06 23:03:36 +02002724 va_list vargs2;
2725 const char *f;
2726 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002727
Victor Stinner8f674cc2013-04-17 23:02:17 +02002728 _PyUnicodeWriter_Init(&writer);
2729 writer.min_length = strlen(format) + 100;
2730 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002731
2732 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2733 Copy it to be able to pass a reference to a subfunction. */
2734 Py_VA_COPY(vargs2, vargs);
2735
2736 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002737 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002738 f = unicode_fromformat_arg(&writer, f, &vargs2);
2739 if (f == NULL)
2740 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002741 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002742 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002743 const char *p;
2744 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002745
Victor Stinnere215d962012-10-06 23:03:36 +02002746 p = f;
2747 do
2748 {
2749 if ((unsigned char)*p > 127) {
2750 PyErr_Format(PyExc_ValueError,
2751 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2752 "string, got a non-ASCII byte: 0x%02x",
2753 (unsigned char)*p);
2754 return NULL;
2755 }
2756 p++;
2757 }
2758 while (*p != '\0' && *p != '%');
2759 len = p - f;
2760
2761 if (*p == '\0')
2762 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002763
2764 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002765 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002766
2767 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002768 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002769 }
Victor Stinnere215d962012-10-06 23:03:36 +02002770 return _PyUnicodeWriter_Finish(&writer);
2771
2772 fail:
2773 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002774 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002775}
2776
Walter Dörwaldd2034312007-05-18 16:29:38 +00002777PyObject *
2778PyUnicode_FromFormat(const char *format, ...)
2779{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002780 PyObject* ret;
2781 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002782
2783#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002784 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002785#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002786 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002787#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002788 ret = PyUnicode_FromFormatV(format, vargs);
2789 va_end(vargs);
2790 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002791}
2792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002793#ifdef HAVE_WCHAR_H
2794
Victor Stinner5593d8a2010-10-02 11:11:27 +00002795/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2796 convert a Unicode object to a wide character string.
2797
Victor Stinnerd88d9832011-09-06 02:00:05 +02002798 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002799 character) required to convert the unicode object. Ignore size argument.
2800
Victor Stinnerd88d9832011-09-06 02:00:05 +02002801 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002802 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002803 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002804static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002805unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002806 wchar_t *w,
2807 Py_ssize_t size)
2808{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002809 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002810 const wchar_t *wstr;
2811
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002812 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002813 if (wstr == NULL)
2814 return -1;
2815
Victor Stinner5593d8a2010-10-02 11:11:27 +00002816 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002817 if (size > res)
2818 size = res + 1;
2819 else
2820 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002821 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002822 return res;
2823 }
2824 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002825 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002826}
2827
2828Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002829PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002830 wchar_t *w,
2831 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832{
2833 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002834 PyErr_BadInternalCall();
2835 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002837 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838}
2839
Victor Stinner137c34c2010-09-29 10:25:54 +00002840wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002841PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002842 Py_ssize_t *size)
2843{
2844 wchar_t* buffer;
2845 Py_ssize_t buflen;
2846
2847 if (unicode == NULL) {
2848 PyErr_BadInternalCall();
2849 return NULL;
2850 }
2851
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002852 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002853 if (buflen == -1)
2854 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002855 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002856 PyErr_NoMemory();
2857 return NULL;
2858 }
2859
Victor Stinner137c34c2010-09-29 10:25:54 +00002860 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2861 if (buffer == NULL) {
2862 PyErr_NoMemory();
2863 return NULL;
2864 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002865 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002866 if (buflen == -1) {
2867 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002868 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002869 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002870 if (size != NULL)
2871 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002872 return buffer;
2873}
2874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002875#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002876
Alexander Belopolsky40018472011-02-26 01:02:56 +00002877PyObject *
2878PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002879{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002880 PyObject *v;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002881 void *data;
2882 int kind;
2883
Victor Stinner8faf8212011-12-08 22:14:11 +01002884 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002885 PyErr_SetString(PyExc_ValueError,
2886 "chr() arg not in range(0x110000)");
2887 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002888 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002889
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002890 if ((Py_UCS4)ordinal < 256)
2891 return get_latin1_char((unsigned char)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002893 v = PyUnicode_New(1, ordinal);
2894 if (v == NULL)
2895 return NULL;
Victor Stinner69ed0f42013-04-09 21:48:24 +02002896 kind = PyUnicode_KIND(v);
2897 data = PyUnicode_DATA(v);
2898 PyUnicode_WRITE(kind, data, 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002899 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002900 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002901}
2902
Alexander Belopolsky40018472011-02-26 01:02:56 +00002903PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002904PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002906 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002907 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002908 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002909 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002910 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002911 Py_INCREF(obj);
2912 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002913 }
2914 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002915 /* For a Unicode subtype that's not a Unicode object,
2916 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002917 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002918 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002919 PyErr_Format(PyExc_TypeError,
2920 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002921 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002922 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002923}
2924
Alexander Belopolsky40018472011-02-26 01:02:56 +00002925PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002926PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002927 const char *encoding,
2928 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002929{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002930 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002931 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002932
Guido van Rossumd57fd912000-03-10 22:53:23 +00002933 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002934 PyErr_BadInternalCall();
2935 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002936 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002937
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002938 /* Decoding bytes objects is the most common case and should be fast */
2939 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002940 if (PyBytes_GET_SIZE(obj) == 0)
2941 _Py_RETURN_UNICODE_EMPTY();
2942 v = PyUnicode_Decode(
2943 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2944 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002945 return v;
2946 }
2947
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002948 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002949 PyErr_SetString(PyExc_TypeError,
2950 "decoding str is not supported");
2951 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002952 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002953
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002954 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2955 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2956 PyErr_Format(PyExc_TypeError,
2957 "coercing to str: need bytes, bytearray "
2958 "or buffer-like object, %.80s found",
2959 Py_TYPE(obj)->tp_name);
2960 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002961 }
Tim Petersced69f82003-09-16 20:30:58 +00002962
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002963 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002964 PyBuffer_Release(&buffer);
2965 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002967
Serhiy Storchaka05997252013-01-26 12:14:02 +02002968 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002969 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002970 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002971}
2972
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The result is written to lower, a buffer of lower_len bytes, and is
   always null-terminated on success.

   Return 1 on success, 0 on error (the normalized name, including its
   terminating null byte, does not fit into lower_len bytes; this includes
   lower_len == 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* Without this guard lower_len - 1 wraps around, l_end points outside
       the buffer and the loop below never detects the end of it. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3012
Alexander Belopolsky40018472011-02-26 01:02:56 +00003013PyObject *
3014PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003015 Py_ssize_t size,
3016 const char *encoding,
3017 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003018{
3019 PyObject *buffer = NULL, *unicode;
3020 Py_buffer info;
3021 char lower[11]; /* Enough for any encoding shortcut */
3022
Fred Drakee4315f52000-05-09 19:53:39 +00003023 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003024 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003025 if ((strcmp(lower, "utf-8") == 0) ||
3026 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003027 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003028 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003029 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003030 (strcmp(lower, "iso-8859-1") == 0) ||
3031 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00003032 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003033#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003034 else if (strcmp(lower, "mbcs") == 0)
3035 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003036#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003037 else if (strcmp(lower, "ascii") == 0)
3038 return PyUnicode_DecodeASCII(s, size, errors);
3039 else if (strcmp(lower, "utf-16") == 0)
3040 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3041 else if (strcmp(lower, "utf-32") == 0)
3042 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3043 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003044
3045 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003046 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003047 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003048 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003049 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 if (buffer == NULL)
3051 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003052 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053 if (unicode == NULL)
3054 goto onError;
3055 if (!PyUnicode_Check(unicode)) {
3056 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003057 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3058 "use codecs.decode() to decode to arbitrary types",
3059 encoding,
3060 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061 Py_DECREF(unicode);
3062 goto onError;
3063 }
3064 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003065 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003066
Benjamin Peterson29060642009-01-31 22:14:21 +00003067 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003068 Py_XDECREF(buffer);
3069 return NULL;
3070}
3071
Alexander Belopolsky40018472011-02-26 01:02:56 +00003072PyObject *
3073PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003074 const char *encoding,
3075 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003076{
3077 PyObject *v;
3078
3079 if (!PyUnicode_Check(unicode)) {
3080 PyErr_BadArgument();
3081 goto onError;
3082 }
3083
3084 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003085 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003086
3087 /* Decode via the codec registry */
3088 v = PyCodec_Decode(unicode, encoding, errors);
3089 if (v == NULL)
3090 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003091 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003092
Benjamin Peterson29060642009-01-31 22:14:21 +00003093 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003094 return NULL;
3095}
3096
Alexander Belopolsky40018472011-02-26 01:02:56 +00003097PyObject *
3098PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003099 const char *encoding,
3100 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003101{
3102 PyObject *v;
3103
3104 if (!PyUnicode_Check(unicode)) {
3105 PyErr_BadArgument();
3106 goto onError;
3107 }
3108
3109 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003110 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003111
3112 /* Decode via the codec registry */
3113 v = PyCodec_Decode(unicode, encoding, errors);
3114 if (v == NULL)
3115 goto onError;
3116 if (!PyUnicode_Check(v)) {
3117 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003118 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3119 "use codecs.decode() to decode to arbitrary types",
3120 encoding,
3121 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003122 Py_DECREF(v);
3123 goto onError;
3124 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003125 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003126
Benjamin Peterson29060642009-01-31 22:14:21 +00003127 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003128 return NULL;
3129}
3130
Alexander Belopolsky40018472011-02-26 01:02:56 +00003131PyObject *
3132PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003133 Py_ssize_t size,
3134 const char *encoding,
3135 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136{
3137 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003138
Guido van Rossumd57fd912000-03-10 22:53:23 +00003139 unicode = PyUnicode_FromUnicode(s, size);
3140 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003141 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003142 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3143 Py_DECREF(unicode);
3144 return v;
3145}
3146
Alexander Belopolsky40018472011-02-26 01:02:56 +00003147PyObject *
3148PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003149 const char *encoding,
3150 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003151{
3152 PyObject *v;
3153
3154 if (!PyUnicode_Check(unicode)) {
3155 PyErr_BadArgument();
3156 goto onError;
3157 }
3158
3159 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003160 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003161
3162 /* Encode via the codec registry */
3163 v = PyCodec_Encode(unicode, encoding, errors);
3164 if (v == NULL)
3165 goto onError;
3166 return v;
3167
Benjamin Peterson29060642009-01-31 22:14:21 +00003168 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003169 return NULL;
3170}
3171
/* Find the first wide character of wstr that wcstombs() cannot convert.

   The string is converted one character at a time (one surrogate pair at
   a time when wchar_t is 16 bits wide).  Returns the index, in wchar_t
   units, of the first unit that fails to convert, or 0 if every unit
   converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    /* The last slot always holds the terminator of the short string that
       is handed to wcstombs(). */
#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    while (*cursor != L'\0') {
        const wchar_t *candidate = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep both halves of a surrogate pair together. */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return candidate - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3218
Victor Stinner1b579672011-12-17 05:47:23 +01003219static int
3220locale_error_handler(const char *errors, int *surrogateescape)
3221{
3222 if (errors == NULL) {
3223 *surrogateescape = 0;
3224 return 0;
3225 }
3226
3227 if (strcmp(errors, "strict") == 0) {
3228 *surrogateescape = 0;
3229 return 0;
3230 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003231 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003232 *surrogateescape = 1;
3233 return 0;
3234 }
3235 PyErr_Format(PyExc_ValueError,
3236 "only 'strict' and 'surrogateescape' error handlers "
3237 "are supported, not '%s'",
3238 errors);
3239 return -1;
3240}
3241
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003242PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003243PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003244{
3245 Py_ssize_t wlen, wlen2;
3246 wchar_t *wstr;
3247 PyObject *bytes = NULL;
3248 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003249 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003250 PyObject *exc;
3251 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003252 int surrogateescape;
3253
3254 if (locale_error_handler(errors, &surrogateescape) < 0)
3255 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003256
3257 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3258 if (wstr == NULL)
3259 return NULL;
3260
3261 wlen2 = wcslen(wstr);
3262 if (wlen2 != wlen) {
3263 PyMem_Free(wstr);
3264 PyErr_SetString(PyExc_TypeError, "embedded null character");
3265 return NULL;
3266 }
3267
3268 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003269 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003270 char *str;
3271
3272 str = _Py_wchar2char(wstr, &error_pos);
3273 if (str == NULL) {
3274 if (error_pos == (size_t)-1) {
3275 PyErr_NoMemory();
3276 PyMem_Free(wstr);
3277 return NULL;
3278 }
3279 else {
3280 goto encode_error;
3281 }
3282 }
3283 PyMem_Free(wstr);
3284
3285 bytes = PyBytes_FromString(str);
3286 PyMem_Free(str);
3287 }
3288 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003289 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003290 size_t len, len2;
3291
3292 len = wcstombs(NULL, wstr, 0);
3293 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003294 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003295 goto encode_error;
3296 }
3297
3298 bytes = PyBytes_FromStringAndSize(NULL, len);
3299 if (bytes == NULL) {
3300 PyMem_Free(wstr);
3301 return NULL;
3302 }
3303
3304 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3305 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003306 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003307 goto encode_error;
3308 }
3309 PyMem_Free(wstr);
3310 }
3311 return bytes;
3312
3313encode_error:
3314 errmsg = strerror(errno);
3315 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003316
3317 if (error_pos == (size_t)-1)
3318 error_pos = wcstombs_errorpos(wstr);
3319
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003320 PyMem_Free(wstr);
3321 Py_XDECREF(bytes);
3322
Victor Stinner2f197072011-12-17 07:08:30 +01003323 if (errmsg != NULL) {
3324 size_t errlen;
3325 wstr = _Py_char2wchar(errmsg, &errlen);
3326 if (wstr != NULL) {
3327 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003328 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003329 } else
3330 errmsg = NULL;
3331 }
3332 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003333 reason = PyUnicode_FromString(
3334 "wcstombs() encountered an unencodable "
3335 "wide character");
3336 if (reason == NULL)
3337 return NULL;
3338
3339 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3340 "locale", unicode,
3341 (Py_ssize_t)error_pos,
3342 (Py_ssize_t)(error_pos+1),
3343 reason);
3344 Py_DECREF(reason);
3345 if (exc != NULL) {
3346 PyCodec_StrictErrors(exc);
3347 Py_XDECREF(exc);
3348 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003349 return NULL;
3350}
3351
Victor Stinnerad158722010-10-27 00:25:46 +00003352PyObject *
3353PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003354{
Victor Stinner99b95382011-07-04 14:23:54 +02003355#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003356 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003357#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003358 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003359#else
Victor Stinner793b5312011-04-27 00:24:21 +02003360 PyInterpreterState *interp = PyThreadState_GET()->interp;
3361 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3362 cannot use it to encode and decode filenames before it is loaded. Load
3363 the Python codec requires to encode at least its own filename. Use the C
3364 version of the locale codec until the codec registry is initialized and
3365 the Python codec is loaded.
3366
3367 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3368 cannot only rely on it: check also interp->fscodec_initialized for
3369 subinterpreters. */
3370 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003371 return PyUnicode_AsEncodedString(unicode,
3372 Py_FileSystemDefaultEncoding,
3373 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003374 }
3375 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003376 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003377 }
Victor Stinnerad158722010-10-27 00:25:46 +00003378#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003379}
3380
Alexander Belopolsky40018472011-02-26 01:02:56 +00003381PyObject *
3382PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003383 const char *encoding,
3384 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003385{
3386 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003387 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003388
Guido van Rossumd57fd912000-03-10 22:53:23 +00003389 if (!PyUnicode_Check(unicode)) {
3390 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003391 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003392 }
Fred Drakee4315f52000-05-09 19:53:39 +00003393
Fred Drakee4315f52000-05-09 19:53:39 +00003394 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003395 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003396 if ((strcmp(lower, "utf-8") == 0) ||
3397 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003398 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003399 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003400 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003401 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003402 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003403 }
Victor Stinner37296e82010-06-10 13:36:23 +00003404 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003405 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003406 (strcmp(lower, "iso-8859-1") == 0) ||
3407 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003408 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003409#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003410 else if (strcmp(lower, "mbcs") == 0)
3411 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003412#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003413 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003414 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003415 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416
3417 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003418 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003419 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003420 return NULL;
3421
3422 /* The normal path */
3423 if (PyBytes_Check(v))
3424 return v;
3425
3426 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003427 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003428 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003429 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003430
3431 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003432 "encoder %s returned bytearray instead of bytes; "
3433 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003434 encoding);
3435 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003436 Py_DECREF(v);
3437 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003438 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003439
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003440 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3441 Py_DECREF(v);
3442 return b;
3443 }
3444
3445 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003446 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3447 "use codecs.encode() to encode to arbitrary types",
3448 encoding,
3449 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003450 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003451 return NULL;
3452}
3453
Alexander Belopolsky40018472011-02-26 01:02:56 +00003454PyObject *
3455PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003456 const char *encoding,
3457 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003458{
3459 PyObject *v;
3460
3461 if (!PyUnicode_Check(unicode)) {
3462 PyErr_BadArgument();
3463 goto onError;
3464 }
3465
3466 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003467 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003468
3469 /* Encode via the codec registry */
3470 v = PyCodec_Encode(unicode, encoding, errors);
3471 if (v == NULL)
3472 goto onError;
3473 if (!PyUnicode_Check(v)) {
3474 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003475 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3476 "use codecs.encode() to encode to arbitrary types",
3477 encoding,
3478 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003479 Py_DECREF(v);
3480 goto onError;
3481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003482 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003483
Benjamin Peterson29060642009-01-31 22:14:21 +00003484 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003485 return NULL;
3486}
3487
/* Find the first byte sequence of str that cannot be decoded.

   When mbrtowc() is available (HAVE_MBRTOWC), at most len bytes of str
   are converted one character at a time, starting from a zeroed
   conversion state.  Returns the byte offset of the first invalid or
   incomplete sequence, or 0 if none is found.  Without mbrtowc() the
   position cannot be determined and 0 is always returned. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        else {
            str += converted;
            len -= converted;
        }
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    /* No way to scan the string: both parameters are intentionally
       unused. */
    (void)str;
    (void)len;
    return 0;
#endif
}
3518
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003519PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003520PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003521 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003522{
3523 wchar_t smallbuf[256];
3524 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3525 wchar_t *wstr;
3526 size_t wlen, wlen2;
3527 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003528 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003529 size_t error_pos;
3530 char *errmsg;
3531 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003532
3533 if (locale_error_handler(errors, &surrogateescape) < 0)
3534 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003535
3536 if (str[len] != '\0' || len != strlen(str)) {
3537 PyErr_SetString(PyExc_TypeError, "embedded null character");
3538 return NULL;
3539 }
3540
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003541 if (surrogateescape) {
3542 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003543 wstr = _Py_char2wchar(str, &wlen);
3544 if (wstr == NULL) {
3545 if (wlen == (size_t)-1)
3546 PyErr_NoMemory();
3547 else
3548 PyErr_SetFromErrno(PyExc_OSError);
3549 return NULL;
3550 }
3551
3552 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003553 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003554 }
3555 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003556 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003557#ifndef HAVE_BROKEN_MBSTOWCS
3558 wlen = mbstowcs(NULL, str, 0);
3559#else
3560 wlen = len;
3561#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003562 if (wlen == (size_t)-1)
3563 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003564 if (wlen+1 <= smallbuf_len) {
3565 wstr = smallbuf;
3566 }
3567 else {
3568 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3569 return PyErr_NoMemory();
3570
3571 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3572 if (!wstr)
3573 return PyErr_NoMemory();
3574 }
3575
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003576 wlen2 = mbstowcs(wstr, str, wlen+1);
3577 if (wlen2 == (size_t)-1) {
3578 if (wstr != smallbuf)
3579 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003580 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003581 }
3582#ifdef HAVE_BROKEN_MBSTOWCS
3583 assert(wlen2 == wlen);
3584#endif
3585 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3586 if (wstr != smallbuf)
3587 PyMem_Free(wstr);
3588 }
3589 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003590
3591decode_error:
3592 errmsg = strerror(errno);
3593 assert(errmsg != NULL);
3594
3595 error_pos = mbstowcs_errorpos(str, len);
3596 if (errmsg != NULL) {
3597 size_t errlen;
3598 wstr = _Py_char2wchar(errmsg, &errlen);
3599 if (wstr != NULL) {
3600 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003601 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003602 } else
3603 errmsg = NULL;
3604 }
3605 if (errmsg == NULL)
3606 reason = PyUnicode_FromString(
3607 "mbstowcs() encountered an invalid multibyte sequence");
3608 if (reason == NULL)
3609 return NULL;
3610
3611 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3612 "locale", str, len,
3613 (Py_ssize_t)error_pos,
3614 (Py_ssize_t)(error_pos+1),
3615 reason);
3616 Py_DECREF(reason);
3617 if (exc != NULL) {
3618 PyCodec_StrictErrors(exc);
3619 Py_XDECREF(exc);
3620 }
3621 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003622}
3623
3624PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003625PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003626{
3627 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003628 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003629}
3630
3631
3632PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003633PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003634 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003635 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3636}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003637
Christian Heimes5894ba72007-11-04 11:43:14 +00003638PyObject*
3639PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3640{
Victor Stinner99b95382011-07-04 14:23:54 +02003641#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003642 return PyUnicode_DecodeMBCS(s, size, NULL);
3643#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003644 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003645#else
Victor Stinner793b5312011-04-27 00:24:21 +02003646 PyInterpreterState *interp = PyThreadState_GET()->interp;
3647 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3648 cannot use it to encode and decode filenames before it is loaded. Load
3649 the Python codec requires to encode at least its own filename. Use the C
3650 version of the locale codec until the codec registry is initialized and
3651 the Python codec is loaded.
3652
3653 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3654 cannot only rely on it: check also interp->fscodec_initialized for
3655 subinterpreters. */
3656 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003657 return PyUnicode_Decode(s, size,
3658 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003659 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003660 }
3661 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003662 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003663 }
Victor Stinnerad158722010-10-27 00:25:46 +00003664#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003665}
3666
Martin v. Löwis011e8422009-05-05 04:43:17 +00003667
3668int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003669_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003670{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003671 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003672
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003673 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003674 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003675 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3676 PyUnicode_GET_LENGTH(str), '\0', 1);
3677 if (pos == -1)
3678 return 0;
3679 else
3680 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003681}
3682
Antoine Pitrou13348842012-01-29 18:36:34 +01003683int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003684PyUnicode_FSConverter(PyObject* arg, void* addr)
3685{
3686 PyObject *output = NULL;
3687 Py_ssize_t size;
3688 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003689 if (arg == NULL) {
3690 Py_DECREF(*(PyObject**)addr);
3691 return 1;
3692 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003693 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003694 output = arg;
3695 Py_INCREF(output);
3696 }
3697 else {
3698 arg = PyUnicode_FromObject(arg);
3699 if (!arg)
3700 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003701 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003702 Py_DECREF(arg);
3703 if (!output)
3704 return 0;
3705 if (!PyBytes_Check(output)) {
3706 Py_DECREF(output);
3707 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3708 return 0;
3709 }
3710 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003711 size = PyBytes_GET_SIZE(output);
3712 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003713 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003714 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003715 Py_DECREF(output);
3716 return 0;
3717 }
3718 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003719 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003720}
3721
3722
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003723int
3724PyUnicode_FSDecoder(PyObject* arg, void* addr)
3725{
3726 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003727 if (arg == NULL) {
3728 Py_DECREF(*(PyObject**)addr);
3729 return 1;
3730 }
3731 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003732 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003733 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003734 output = arg;
3735 Py_INCREF(output);
3736 }
3737 else {
3738 arg = PyBytes_FromObject(arg);
3739 if (!arg)
3740 return 0;
3741 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3742 PyBytes_GET_SIZE(arg));
3743 Py_DECREF(arg);
3744 if (!output)
3745 return 0;
3746 if (!PyUnicode_Check(output)) {
3747 Py_DECREF(output);
3748 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3749 return 0;
3750 }
3751 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003752 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003753 Py_DECREF(output);
3754 return 0;
3755 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003756 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003757 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003758 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3759 Py_DECREF(output);
3760 return 0;
3761 }
3762 *(PyObject**)addr = output;
3763 return Py_CLEANUP_SUPPORTED;
3764}
3765
3766
Martin v. Löwis5b222132007-06-10 09:51:05 +00003767char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003768PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003769{
Christian Heimesf3863112007-11-22 07:46:41 +00003770 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003772 if (!PyUnicode_Check(unicode)) {
3773 PyErr_BadArgument();
3774 return NULL;
3775 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003776 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003777 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003778
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003779 if (PyUnicode_UTF8(unicode) == NULL) {
3780 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003781 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3782 if (bytes == NULL)
3783 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003784 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3785 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003786 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003787 Py_DECREF(bytes);
3788 return NULL;
3789 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003790 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3791 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3792 PyBytes_AS_STRING(bytes),
3793 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003794 Py_DECREF(bytes);
3795 }
3796
3797 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003798 *psize = PyUnicode_UTF8_LENGTH(unicode);
3799 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003800}
3801
3802char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003803PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003804{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003805 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3806}
3807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003808Py_UNICODE *
3809PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3810{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003811 const unsigned char *one_byte;
3812#if SIZEOF_WCHAR_T == 4
3813 const Py_UCS2 *two_bytes;
3814#else
3815 const Py_UCS4 *four_bytes;
3816 const Py_UCS4 *ucs4_end;
3817 Py_ssize_t num_surrogates;
3818#endif
3819 wchar_t *w;
3820 wchar_t *wchar_end;
3821
3822 if (!PyUnicode_Check(unicode)) {
3823 PyErr_BadArgument();
3824 return NULL;
3825 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003826 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003827 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003828 assert(_PyUnicode_KIND(unicode) != 0);
3829 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003830
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003831 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003832#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003833 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3834 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003835 num_surrogates = 0;
3836
3837 for (; four_bytes < ucs4_end; ++four_bytes) {
3838 if (*four_bytes > 0xFFFF)
3839 ++num_surrogates;
3840 }
3841
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003842 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3843 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3844 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003845 PyErr_NoMemory();
3846 return NULL;
3847 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003848 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003849
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003850 w = _PyUnicode_WSTR(unicode);
3851 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3852 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003853 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3854 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003855 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003856 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003857 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3858 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003859 }
3860 else
3861 *w = *four_bytes;
3862
3863 if (w > wchar_end) {
3864 assert(0 && "Miscalculated string end");
3865 }
3866 }
3867 *w = 0;
3868#else
3869 /* sizeof(wchar_t) == 4 */
3870 Py_FatalError("Impossible unicode object state, wstr and str "
3871 "should share memory already.");
3872 return NULL;
3873#endif
3874 }
3875 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003876 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3877 (_PyUnicode_LENGTH(unicode) + 1));
3878 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003879 PyErr_NoMemory();
3880 return NULL;
3881 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003882 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3883 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3884 w = _PyUnicode_WSTR(unicode);
3885 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003886
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003887 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3888 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003889 for (; w < wchar_end; ++one_byte, ++w)
3890 *w = *one_byte;
3891 /* null-terminate the wstr */
3892 *w = 0;
3893 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003894 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003895#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003896 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003897 for (; w < wchar_end; ++two_bytes, ++w)
3898 *w = *two_bytes;
3899 /* null-terminate the wstr */
3900 *w = 0;
3901#else
3902 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003903 PyObject_FREE(_PyUnicode_WSTR(unicode));
3904 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003905 Py_FatalError("Impossible unicode object state, wstr "
3906 "and str should share memory already.");
3907 return NULL;
3908#endif
3909 }
3910 else {
3911 assert(0 && "This should never happen.");
3912 }
3913 }
3914 }
3915 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003916 *size = PyUnicode_WSTR_LENGTH(unicode);
3917 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003918}
3919
Alexander Belopolsky40018472011-02-26 01:02:56 +00003920Py_UNICODE *
3921PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003922{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003923 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003924}
3925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003926
Alexander Belopolsky40018472011-02-26 01:02:56 +00003927Py_ssize_t
3928PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003929{
3930 if (!PyUnicode_Check(unicode)) {
3931 PyErr_BadArgument();
3932 goto onError;
3933 }
3934 return PyUnicode_GET_SIZE(unicode);
3935
Benjamin Peterson29060642009-01-31 22:14:21 +00003936 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003937 return -1;
3938}
3939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003940Py_ssize_t
3941PyUnicode_GetLength(PyObject *unicode)
3942{
Victor Stinner07621332012-06-16 04:53:46 +02003943 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003944 PyErr_BadArgument();
3945 return -1;
3946 }
Victor Stinner07621332012-06-16 04:53:46 +02003947 if (PyUnicode_READY(unicode) == -1)
3948 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003949 return PyUnicode_GET_LENGTH(unicode);
3950}
3951
3952Py_UCS4
3953PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3954{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003955 void *data;
3956 int kind;
3957
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003958 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3959 PyErr_BadArgument();
3960 return (Py_UCS4)-1;
3961 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003962 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003963 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003964 return (Py_UCS4)-1;
3965 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003966 data = PyUnicode_DATA(unicode);
3967 kind = PyUnicode_KIND(unicode);
3968 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003969}
3970
3971int
3972PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3973{
3974 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003975 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003976 return -1;
3977 }
Victor Stinner488fa492011-12-12 00:01:39 +01003978 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003979 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003980 PyErr_SetString(PyExc_IndexError, "string index out of range");
3981 return -1;
3982 }
Victor Stinner488fa492011-12-12 00:01:39 +01003983 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003984 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003985 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3986 PyErr_SetString(PyExc_ValueError, "character out of range");
3987 return -1;
3988 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003989 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3990 index, ch);
3991 return 0;
3992}
3993
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3999
Victor Stinner554f3f02010-06-16 23:33:54 +00004000/* create or adjust a UnicodeDecodeError */
4001static void
4002make_decode_exception(PyObject **exceptionObject,
4003 const char *encoding,
4004 const char *input, Py_ssize_t length,
4005 Py_ssize_t startpos, Py_ssize_t endpos,
4006 const char *reason)
4007{
4008 if (*exceptionObject == NULL) {
4009 *exceptionObject = PyUnicodeDecodeError_Create(
4010 encoding, input, length, startpos, endpos, reason);
4011 }
4012 else {
4013 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4014 goto onError;
4015 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4016 goto onError;
4017 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4018 goto onError;
4019 }
4020 return;
4021
4022onError:
4023 Py_DECREF(*exceptionObject);
4024 *exceptionObject = NULL;
4025}
4026
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   On success *input/*inend are refreshed from the exception's object (the
   callback may have replaced it), *endinpos and *inptr are moved to the
   position returned by the callback, and the replacement is appended to
   *output at *outpos (the output is resized when needed).

   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The text after the ';' doubles as the TypeError message below. */
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        /* Bail out: the bytes accessors below must not be applied to an
           object of another type. */
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string).
       Every addition is checked so the total cannot wrap around. */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* grow geometrically when doubling is itself representable */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4133
4134static int
4135unicode_decode_call_errorhandler_writer(
4136 const char *errors, PyObject **errorHandler,
4137 const char *encoding, const char *reason,
4138 const char **input, const char **inend, Py_ssize_t *startinpos,
4139 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4140 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4141{
4142 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4143
4144 PyObject *restuple = NULL;
4145 PyObject *repunicode = NULL;
4146 Py_ssize_t insize;
4147 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004148 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004149 PyObject *inputobj = NULL;
4150
4151 if (*errorHandler == NULL) {
4152 *errorHandler = PyCodec_LookupError(errors);
4153 if (*errorHandler == NULL)
4154 goto onError;
4155 }
4156
4157 make_decode_exception(exceptionObject,
4158 encoding,
4159 *input, *inend - *input,
4160 *startinpos, *endinpos,
4161 reason);
4162 if (*exceptionObject == NULL)
4163 goto onError;
4164
4165 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4166 if (restuple == NULL)
4167 goto onError;
4168 if (!PyTuple_Check(restuple)) {
4169 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4170 goto onError;
4171 }
4172 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004173 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004174
4175 /* Copy back the bytes variables, which might have been modified by the
4176 callback */
4177 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4178 if (!inputobj)
4179 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004180 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004181 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004182 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004183 *input = PyBytes_AS_STRING(inputobj);
4184 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004185 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004186 /* we can DECREF safely, as the exception has another reference,
4187 so the object won't go away. */
4188 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004189
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004190 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004191 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004192 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004193 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4194 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004195 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196
Victor Stinner8f674cc2013-04-17 23:02:17 +02004197 if (PyUnicode_READY(repunicode) < 0)
4198 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004199 replen = PyUnicode_GET_LENGTH(repunicode);
4200 writer->min_length += replen;
4201 if (replen > 1)
Victor Stinner8f674cc2013-04-17 23:02:17 +02004202 writer->overallocate = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004203 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004204 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004205
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004207 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004208
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004209 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004210 Py_XDECREF(restuple);
4211 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004212
Benjamin Peterson29060642009-01-31 22:14:21 +00004213 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004214 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004215 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004216}
4217
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized so that compound expressions
 * passed as flags are evaluated as a unit. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004298
Alexander Belopolsky40018472011-02-26 01:02:56 +00004299PyObject *
4300PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004301 Py_ssize_t size,
4302 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004303{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004304 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4305}
4306
Antoine Pitrou244651a2009-05-04 18:56:13 +00004307/* The decoder. The only state we preserve is our read position,
4308 * i.e. how many characters we have consumed. So if we end in the
4309 * middle of a shift sequence we have to back off the read position
4310 * and the output to the beginning of the sequence, otherwise we lose
4311 * all the shift state (seen bits, number of bits seen, high
4312 * surrogate). */
4313
Alexander Belopolsky40018472011-02-26 01:02:56 +00004314PyObject *
4315PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004316 Py_ssize_t size,
4317 const char *errors,
4318 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004319{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004320 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004321 Py_ssize_t startinpos;
4322 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004323 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004324 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004325 const char *errmsg = "";
4326 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004327 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004328 unsigned int base64bits = 0;
4329 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004330 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004331 PyObject *errorHandler = NULL;
4332 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004333
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004334 if (size == 0) {
4335 if (consumed)
4336 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004337 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004338 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004339
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004340 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004341 _PyUnicodeWriter_Init(&writer);
4342 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004343
4344 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004345 e = s + size;
4346
4347 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004348 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004349 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004350 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004351
Antoine Pitrou244651a2009-05-04 18:56:13 +00004352 if (inShift) { /* in a base-64 section */
4353 if (IS_BASE64(ch)) { /* consume a base-64 character */
4354 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4355 base64bits += 6;
4356 s++;
4357 if (base64bits >= 16) {
4358 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004359 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004360 base64bits -= 16;
4361 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004362 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004363 if (surrogate) {
4364 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004365 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4366 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004367 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004368 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004369 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004370 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004371 }
4372 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004373 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004374 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004375 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004376 }
4377 }
Victor Stinner551ac952011-11-29 22:58:13 +01004378 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004379 /* first surrogate */
4380 surrogate = outCh;
4381 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004382 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004383 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004384 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004385 }
4386 }
4387 }
4388 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004389 inShift = 0;
4390 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004391 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004392 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004393 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004394 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004395 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004396 if (base64bits > 0) { /* left-over bits */
4397 if (base64bits >= 6) {
4398 /* We've seen at least one base-64 character */
4399 errmsg = "partial character in shift sequence";
4400 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004401 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004402 else {
4403 /* Some bits remain; they should be zero */
4404 if (base64buffer != 0) {
4405 errmsg = "non-zero padding bits in shift sequence";
4406 goto utf7Error;
4407 }
4408 }
4409 }
4410 if (ch != '-') {
4411 /* '-' is absorbed; other terminating
4412 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004413 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004414 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004415 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004416 }
4417 }
4418 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004420 s++; /* consume '+' */
4421 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004422 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004423 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004424 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004425 }
4426 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004427 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004428 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004429 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004430 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004431 }
4432 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004433 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004434 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004435 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004436 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004437 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004438 else {
4439 startinpos = s-starts;
4440 s++;
4441 errmsg = "unexpected special character";
4442 goto utf7Error;
4443 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004444 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004445utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004447 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004448 errors, &errorHandler,
4449 "utf7", errmsg,
4450 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004451 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004452 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004453 }
4454
Antoine Pitrou244651a2009-05-04 18:56:13 +00004455 /* end of string */
4456
4457 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4458 /* if we're in an inconsistent state, that's an error */
4459 if (surrogate ||
4460 (base64bits >= 6) ||
4461 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004462 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004463 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004464 errors, &errorHandler,
4465 "utf7", "unterminated shift sequence",
4466 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004467 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004468 goto onError;
4469 if (s < e)
4470 goto restart;
4471 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004472 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004473
4474 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004475 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004476 if (inShift) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004477 writer.pos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004478 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004479 }
4480 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004481 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004482 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004483 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004484
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485 Py_XDECREF(errorHandler);
4486 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004487 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004488
Benjamin Peterson29060642009-01-31 22:14:21 +00004489 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004490 Py_XDECREF(errorHandler);
4491 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004492 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004493 return NULL;
4494}
4495
4496
Alexander Belopolsky40018472011-02-26 01:02:56 +00004497PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004498_PyUnicode_EncodeUTF7(PyObject *str,
4499 int base64SetO,
4500 int base64WhiteSpace,
4501 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004502{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004503 int kind;
4504 void *data;
4505 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004506 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004507 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004508 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004509 unsigned int base64bits = 0;
4510 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004511 char * out;
4512 char * start;
4513
Benjamin Petersonbac79492012-01-14 13:34:47 -05004514 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004515 return NULL;
4516 kind = PyUnicode_KIND(str);
4517 data = PyUnicode_DATA(str);
4518 len = PyUnicode_GET_LENGTH(str);
4519
4520 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004521 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004522
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004523 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004524 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004525 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004526 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004527 if (v == NULL)
4528 return NULL;
4529
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004530 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004531 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004532 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004533
Antoine Pitrou244651a2009-05-04 18:56:13 +00004534 if (inShift) {
4535 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4536 /* shifting out */
4537 if (base64bits) { /* output remaining bits */
4538 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4539 base64buffer = 0;
4540 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004541 }
4542 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004543 /* Characters not in the BASE64 set implicitly unshift the sequence
4544 so no '-' is required, except if the character is itself a '-' */
4545 if (IS_BASE64(ch) || ch == '-') {
4546 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004547 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004548 *out++ = (char) ch;
4549 }
4550 else {
4551 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004552 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004553 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004554 else { /* not in a shift sequence */
4555 if (ch == '+') {
4556 *out++ = '+';
4557 *out++ = '-';
4558 }
4559 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4560 *out++ = (char) ch;
4561 }
4562 else {
4563 *out++ = '+';
4564 inShift = 1;
4565 goto encode_char;
4566 }
4567 }
4568 continue;
4569encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004571 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004572
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 /* code first surrogate */
4574 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004575 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576 while (base64bits >= 6) {
4577 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4578 base64bits -= 6;
4579 }
4580 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004581 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004582 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004583 base64bits += 16;
4584 base64buffer = (base64buffer << 16) | ch;
4585 while (base64bits >= 6) {
4586 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4587 base64bits -= 6;
4588 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004589 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590 if (base64bits)
4591 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4592 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004593 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004594 if (_PyBytes_Resize(&v, out - start) < 0)
4595 return NULL;
4596 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004597}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004598PyObject *
4599PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4600 Py_ssize_t size,
4601 int base64SetO,
4602 int base64WhiteSpace,
4603 const char *errors)
4604{
4605 PyObject *result;
4606 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4607 if (tmp == NULL)
4608 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004609 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004610 base64WhiteSpace, errors);
4611 Py_DECREF(tmp);
4612 return result;
4613}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004614
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615#undef IS_BASE64
4616#undef FROM_BASE64
4617#undef TO_BASE64
4618#undef DECODE_DIRECT
4619#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004620
/* --- UTF-8 Codec -------------------------------------------------------- */
4622
Alexander Belopolsky40018472011-02-26 01:02:56 +00004623PyObject *
4624PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004625 Py_ssize_t size,
4626 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627{
Walter Dörwald69652032004-09-07 20:24:22 +00004628 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4629}
4630
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004631#include "stringlib/asciilib.h"
4632#include "stringlib/codecs.h"
4633#include "stringlib/undef.h"
4634
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004635#include "stringlib/ucs1lib.h"
4636#include "stringlib/codecs.h"
4637#include "stringlib/undef.h"
4638
4639#include "stringlib/ucs2lib.h"
4640#include "stringlib/codecs.h"
4641#include "stringlib/undef.h"
4642
4643#include "stringlib/ucs4lib.h"
4644#include "stringlib/codecs.h"
4645#include "stringlib/undef.h"
4646
Antoine Pitrouab868312009-01-10 15:40:25 +00004647/* Mask to quickly check whether a C 'long' contains a
4648 non-ASCII, UTF8-encoded char. */
4649#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004650# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004651#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004652# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004653#else
4654# error C 'long' size should be either 4 or 8!
4655#endif
4656
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004657static Py_ssize_t
4658ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004659{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004660 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004661 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004662
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004663 /*
4664 * Issue #17237: m68k is a bit different from most architectures in
4665 * that objects do not use "natural alignment" - for example, int and
4666 * long are only aligned at 2-byte boundaries. Therefore the assert()
4667 * won't work; also, tests have shown that skipping the "optimised
4668 * version" will even speed up m68k.
4669 */
4670#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004671#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004672 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4673 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004674 /* Fast path, see in STRINGLIB(utf8_decode) for
4675 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004676 /* Help allocation */
4677 const char *_p = p;
4678 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004679 while (_p < aligned_end) {
4680 unsigned long value = *(const unsigned long *) _p;
4681 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004682 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004683 *((unsigned long *)q) = value;
4684 _p += SIZEOF_LONG;
4685 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004686 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004687 p = _p;
4688 while (p < end) {
4689 if ((unsigned char)*p & 0x80)
4690 break;
4691 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004693 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004695#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004696#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004697 while (p < end) {
4698 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4699 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004700 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004701 /* Help allocation */
4702 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004703 while (_p < aligned_end) {
4704 unsigned long value = *(unsigned long *) _p;
4705 if (value & ASCII_CHAR_MASK)
4706 break;
4707 _p += SIZEOF_LONG;
4708 }
4709 p = _p;
4710 if (_p == end)
4711 break;
4712 }
4713 if ((unsigned char)*p & 0x80)
4714 break;
4715 ++p;
4716 }
4717 memcpy(dest, start, p - start);
4718 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719}
Antoine Pitrouab868312009-01-10 15:40:25 +00004720
Victor Stinner785938e2011-12-11 20:09:03 +01004721PyObject *
4722PyUnicode_DecodeUTF8Stateful(const char *s,
4723 Py_ssize_t size,
4724 const char *errors,
4725 Py_ssize_t *consumed)
4726{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004727 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004728 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004729 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004730
4731 Py_ssize_t startinpos;
4732 Py_ssize_t endinpos;
4733 const char *errmsg = "";
4734 PyObject *errorHandler = NULL;
4735 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004736
4737 if (size == 0) {
4738 if (consumed)
4739 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004740 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004741 }
4742
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004743 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4744 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004745 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004746 *consumed = 1;
4747 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004748 }
4749
Victor Stinner8f674cc2013-04-17 23:02:17 +02004750 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004751 writer.min_length = size;
4752 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004753 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004754
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004755 writer.pos = ascii_decode(s, end, writer.data);
4756 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004757 while (s < end) {
4758 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004759 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004760 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004761 if (PyUnicode_IS_ASCII(writer.buffer))
4762 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004763 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004764 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004765 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004766 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004767 } else {
4768 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004769 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004770 }
4771
4772 switch (ch) {
4773 case 0:
4774 if (s == end || consumed)
4775 goto End;
4776 errmsg = "unexpected end of data";
4777 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004778 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004779 break;
4780 case 1:
4781 errmsg = "invalid start byte";
4782 startinpos = s - starts;
4783 endinpos = startinpos + 1;
4784 break;
4785 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004786 case 3:
4787 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004788 errmsg = "invalid continuation byte";
4789 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004790 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004791 break;
4792 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004793 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004794 goto onError;
4795 continue;
4796 }
4797
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004798 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004799 errors, &errorHandler,
4800 "utf-8", errmsg,
4801 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004802 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004803 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004804 }
4805
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004806End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004807 if (consumed)
4808 *consumed = s - starts;
4809
4810 Py_XDECREF(errorHandler);
4811 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004812 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004813
4814onError:
4815 Py_XDECREF(errorHandler);
4816 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004817 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004818 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004819}
4820
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004821#ifdef __APPLE__
4822
4823/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004824 used to decode the command line arguments on Mac OS X.
4825
4826 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004827 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004828
4829wchar_t*
4830_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4831{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004832 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004833 wchar_t *unicode;
4834 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004835
4836 /* Note: size will always be longer than the resulting Unicode
4837 character count */
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004838 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004839 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004840 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004841 if (!unicode)
4842 return NULL;
4843
4844 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004845 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004846 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004847 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004848 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004849#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004850 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004851#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004852 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004853#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004854 if (ch > 0xFF) {
4855#if SIZEOF_WCHAR_T == 4
4856 assert(0);
4857#else
4858 assert(Py_UNICODE_IS_SURROGATE(ch));
4859 /* compute and append the two surrogates: */
4860 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4861 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4862#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004863 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004864 else {
4865 if (!ch && s == e)
4866 break;
4867 /* surrogateescape */
4868 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4869 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004870 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004871 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004872 return unicode;
4873}
4874
4875#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004877/* Primary internal function which creates utf8 encoded bytes objects.
4878
4879 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004880 and allocate exactly as much space needed at the end. Else allocate the
4881 maximum possible needed (4 result bytes per Unicode character), and return
4882 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004883*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004884PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004885_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886{
Victor Stinner6099a032011-12-18 14:22:26 +01004887 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004888 void *data;
4889 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004890
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004891 if (!PyUnicode_Check(unicode)) {
4892 PyErr_BadArgument();
4893 return NULL;
4894 }
4895
4896 if (PyUnicode_READY(unicode) == -1)
4897 return NULL;
4898
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004899 if (PyUnicode_UTF8(unicode))
4900 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4901 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004902
4903 kind = PyUnicode_KIND(unicode);
4904 data = PyUnicode_DATA(unicode);
4905 size = PyUnicode_GET_LENGTH(unicode);
4906
Benjamin Petersonead6b532011-12-20 17:23:42 -06004907 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004908 default:
4909 assert(0);
4910 case PyUnicode_1BYTE_KIND:
4911 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4912 assert(!PyUnicode_IS_ASCII(unicode));
4913 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4914 case PyUnicode_2BYTE_KIND:
4915 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4916 case PyUnicode_4BYTE_KIND:
4917 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004918 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919}
4920
Alexander Belopolsky40018472011-02-26 01:02:56 +00004921PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004922PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4923 Py_ssize_t size,
4924 const char *errors)
4925{
4926 PyObject *v, *unicode;
4927
4928 unicode = PyUnicode_FromUnicode(s, size);
4929 if (unicode == NULL)
4930 return NULL;
4931 v = _PyUnicode_AsUTF8String(unicode, errors);
4932 Py_DECREF(unicode);
4933 return v;
4934}
4935
4936PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004937PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004939 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940}
4941
/* --- UTF-32 Codec ------------------------------------------------------- */
4943
4944PyObject *
4945PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 Py_ssize_t size,
4947 const char *errors,
4948 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004949{
4950 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4951}
4952
4953PyObject *
4954PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004955 Py_ssize_t size,
4956 const char *errors,
4957 int *byteorder,
4958 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004959{
4960 const char *starts = s;
4961 Py_ssize_t startinpos;
4962 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004963 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004964 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004965 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004966 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004967 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004968 PyObject *errorHandler = NULL;
4969 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004970
Walter Dörwald41980ca2007-08-16 21:55:45 +00004971 q = (unsigned char *)s;
4972 e = q + size;
4973
4974 if (byteorder)
4975 bo = *byteorder;
4976
4977 /* Check for BOM marks (U+FEFF) in the input and adjust current
4978 byte order setting accordingly. In native mode, the leading BOM
4979 mark is skipped, in all other modes, it is copied to the output
4980 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004981 if (bo == 0 && size >= 4) {
4982 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4983 if (bom == 0x0000FEFF) {
4984 bo = -1;
4985 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004986 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004987 else if (bom == 0xFFFE0000) {
4988 bo = 1;
4989 q += 4;
4990 }
4991 if (byteorder)
4992 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004993 }
4994
Victor Stinnere64322e2012-10-30 23:12:47 +01004995 if (q == e) {
4996 if (consumed)
4997 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004998 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004999 }
5000
Victor Stinnere64322e2012-10-30 23:12:47 +01005001#ifdef WORDS_BIGENDIAN
5002 le = bo < 0;
5003#else
5004 le = bo <= 0;
5005#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005006 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005007
Victor Stinner8f674cc2013-04-17 23:02:17 +02005008 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005009 writer.min_length = (e - q + 3) / 4;
5010 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005011 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005012
Victor Stinnere64322e2012-10-30 23:12:47 +01005013 while (1) {
5014 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005015 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005016
Victor Stinnere64322e2012-10-30 23:12:47 +01005017 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005018 enum PyUnicode_Kind kind = writer.kind;
5019 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005020 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005021 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005022 if (le) {
5023 do {
5024 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5025 if (ch > maxch)
5026 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005027 if (kind != PyUnicode_1BYTE_KIND &&
5028 Py_UNICODE_IS_SURROGATE(ch))
5029 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005030 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005031 q += 4;
5032 } while (q <= last);
5033 }
5034 else {
5035 do {
5036 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5037 if (ch > maxch)
5038 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005039 if (kind != PyUnicode_1BYTE_KIND &&
5040 Py_UNICODE_IS_SURROGATE(ch))
5041 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005042 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005043 q += 4;
5044 } while (q <= last);
5045 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005046 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005047 }
5048
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005049 if (Py_UNICODE_IS_SURROGATE(ch)) {
5050 errmsg = "codepoint in surrogate code point range(0xd800, 0xe000)";
5051 startinpos = ((const char *)q) - starts;
5052 endinpos = startinpos + 4;
5053 }
5054 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005055 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005056 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005057 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005058 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005059 startinpos = ((const char *)q) - starts;
5060 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005061 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005062 else {
5063 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005064 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005065 goto onError;
5066 q += 4;
5067 continue;
5068 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005069 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005070 startinpos = ((const char *)q) - starts;
5071 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005073
5074 /* The remaining input chars are ignored if the callback
5075 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005076 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005077 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005078 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005079 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005080 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005081 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005082 }
5083
Walter Dörwald41980ca2007-08-16 21:55:45 +00005084 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005085 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005086
Walter Dörwald41980ca2007-08-16 21:55:45 +00005087 Py_XDECREF(errorHandler);
5088 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005089 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005090
Benjamin Peterson29060642009-01-31 22:14:21 +00005091 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005092 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005093 Py_XDECREF(errorHandler);
5094 Py_XDECREF(exc);
5095 return NULL;
5096}
5097
5098PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005099_PyUnicode_EncodeUTF32(PyObject *str,
5100 const char *errors,
5101 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005102{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005103 int kind;
5104 void *data;
5105 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005106 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005107 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005108 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005109 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005110#if PY_LITTLE_ENDIAN
Walter Dörwald41980ca2007-08-16 21:55:45 +00005111 int iorder[] = {0, 1, 2, 3};
5112#else
5113 int iorder[] = {3, 2, 1, 0};
5114#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005115 const char *encoding;
5116 PyObject *errorHandler = NULL;
5117 PyObject *exc = NULL;
5118 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005119
Benjamin Peterson29060642009-01-31 22:14:21 +00005120#define STORECHAR(CH) \
5121 do { \
5122 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5123 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5124 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5125 p[iorder[0]] = (CH) & 0xff; \
5126 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005127 } while(0)
5128
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005129 if (!PyUnicode_Check(str)) {
5130 PyErr_BadArgument();
5131 return NULL;
5132 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005133 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005134 return NULL;
5135 kind = PyUnicode_KIND(str);
5136 data = PyUnicode_DATA(str);
5137 len = PyUnicode_GET_LENGTH(str);
5138
5139 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005140 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005141 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005142 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005143 if (v == NULL)
5144 return NULL;
5145
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005146 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005147 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005148 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005149 if (len == 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005150 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005151
5152 if (byteorder == -1) {
5153 /* force LE */
5154 iorder[0] = 0;
5155 iorder[1] = 1;
5156 iorder[2] = 2;
5157 iorder[3] = 3;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005158 encoding = "utf-32-le";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005159 }
5160 else if (byteorder == 1) {
5161 /* force BE */
5162 iorder[0] = 3;
5163 iorder[1] = 2;
5164 iorder[2] = 1;
5165 iorder[3] = 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005166 encoding = "utf-32-be";
5167 }
5168 else
5169 encoding = "utf-32";
5170
5171 if (kind == PyUnicode_1BYTE_KIND) {
5172 for (i = 0; i < len; i++)
5173 STORECHAR(PyUnicode_READ(kind, data, i));
5174 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005175 }
5176
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005177 for (i = 0; i < len;) {
5178 Py_ssize_t repsize, moreunits;
5179 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5180 i++;
5181 assert(ch <= MAX_UNICODE);
5182 if (!Py_UNICODE_IS_SURROGATE(ch)) {
5183 STORECHAR(ch);
5184 continue;
5185 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005186
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005187 rep = unicode_encode_call_errorhandler(
5188 errors, &errorHandler,
5189 encoding, "surrogates not allowed",
5190 str, &exc, i-1, i, &i);
5191
5192 if (!rep)
5193 goto error;
5194
5195 if (PyBytes_Check(rep)) {
5196 repsize = PyBytes_GET_SIZE(rep);
5197 if (repsize & 3) {
5198 raise_encode_exception(&exc, encoding,
5199 str, i - 1, i,
5200 "surrogates not allowed");
5201 goto error;
5202 }
5203 moreunits = repsize / 4;
5204 }
5205 else {
5206 assert(PyUnicode_Check(rep));
5207 if (PyUnicode_READY(rep) < 0)
5208 goto error;
5209 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5210 if (!PyUnicode_IS_ASCII(rep)) {
5211 raise_encode_exception(&exc, encoding,
5212 str, i - 1, i,
5213 "surrogates not allowed");
5214 goto error;
5215 }
5216 }
5217
5218 /* four bytes are reserved for each surrogate */
5219 if (moreunits > 1) {
5220 Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
5221 Py_ssize_t morebytes = 4 * (moreunits - 1);
5222 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5223 /* integer overflow */
5224 PyErr_NoMemory();
5225 goto error;
5226 }
5227 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5228 goto error;
5229 p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
5230 }
5231
5232 if (PyBytes_Check(rep)) {
5233 Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
5234 p += repsize;
5235 } else /* rep is unicode */ {
5236 const Py_UCS1 *repdata;
5237 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5238 repdata = PyUnicode_1BYTE_DATA(rep);
5239 while (repsize--) {
5240 Py_UCS4 ch = *repdata++;
5241 STORECHAR(ch);
5242 }
5243 }
5244
5245 Py_CLEAR(rep);
5246 }
5247
5248 /* Cut back to size actually needed. This is necessary for, for example,
5249 encoding of a string containing isolated surrogates and the 'ignore'
5250 handler is used. */
5251 nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
5252 if (nsize != PyBytes_GET_SIZE(v))
5253 _PyBytes_Resize(&v, nsize);
5254 Py_XDECREF(errorHandler);
5255 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005256 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005257 error:
5258 Py_XDECREF(rep);
5259 Py_XDECREF(errorHandler);
5260 Py_XDECREF(exc);
5261 Py_XDECREF(v);
5262 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005263#undef STORECHAR
5264}
5265
Alexander Belopolsky40018472011-02-26 01:02:56 +00005266PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005267PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5268 Py_ssize_t size,
5269 const char *errors,
5270 int byteorder)
5271{
5272 PyObject *result;
5273 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5274 if (tmp == NULL)
5275 return NULL;
5276 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5277 Py_DECREF(tmp);
5278 return result;
5279}
5280
5281PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005282PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005283{
Victor Stinnerb960b342011-11-20 19:12:52 +01005284 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005285}
5286
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287/* --- UTF-16 Codec ------------------------------------------------------- */
5288
Tim Peters772747b2001-08-09 22:21:55 +00005289PyObject *
5290PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005291 Py_ssize_t size,
5292 const char *errors,
5293 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294{
Walter Dörwald69652032004-09-07 20:24:22 +00005295 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5296}
5297
5298PyObject *
5299PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005300 Py_ssize_t size,
5301 const char *errors,
5302 int *byteorder,
5303 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005304{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005305 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005306 Py_ssize_t startinpos;
5307 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005308 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005309 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005310 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005311 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005312 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005313 PyObject *errorHandler = NULL;
5314 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005315 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316
Tim Peters772747b2001-08-09 22:21:55 +00005317 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005318 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319
5320 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005321 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005322
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005323 /* Check for BOM marks (U+FEFF) in the input and adjust current
5324 byte order setting accordingly. In native mode, the leading BOM
5325 mark is skipped, in all other modes, it is copied to the output
5326 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005327 if (bo == 0 && size >= 2) {
5328 const Py_UCS4 bom = (q[1] << 8) | q[0];
5329 if (bom == 0xFEFF) {
5330 q += 2;
5331 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005332 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005333 else if (bom == 0xFFFE) {
5334 q += 2;
5335 bo = 1;
5336 }
5337 if (byteorder)
5338 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005339 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340
Antoine Pitrou63065d72012-05-15 23:48:04 +02005341 if (q == e) {
5342 if (consumed)
5343 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005344 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005345 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005346
Christian Heimes743e0cd2012-10-17 23:52:17 +02005347#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005348 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005349 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005350#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005351 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005352 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005353#endif
Tim Peters772747b2001-08-09 22:21:55 +00005354
Antoine Pitrou63065d72012-05-15 23:48:04 +02005355 /* Note: size will always be longer than the resulting Unicode
5356 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005357 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005358 writer.min_length = (e - q + 1) / 2;
5359 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005360 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005361
Antoine Pitrou63065d72012-05-15 23:48:04 +02005362 while (1) {
5363 Py_UCS4 ch = 0;
5364 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005365 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005366 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005367 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005368 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005369 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005370 native_ordering);
5371 else
5372 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005373 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005374 native_ordering);
5375 } else if (kind == PyUnicode_2BYTE_KIND) {
5376 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005377 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005378 native_ordering);
5379 } else {
5380 assert(kind == PyUnicode_4BYTE_KIND);
5381 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005382 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005383 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005384 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005385 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005386
Antoine Pitrou63065d72012-05-15 23:48:04 +02005387 switch (ch)
5388 {
5389 case 0:
5390 /* remaining byte at the end? (size should be even) */
5391 if (q == e || consumed)
5392 goto End;
5393 errmsg = "truncated data";
5394 startinpos = ((const char *)q) - starts;
5395 endinpos = ((const char *)e) - starts;
5396 break;
5397 /* The remaining input chars are ignored if the callback
5398 chooses to skip the input */
5399 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005400 q -= 2;
5401 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005402 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005403 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005404 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005405 endinpos = ((const char *)e) - starts;
5406 break;
5407 case 2:
5408 errmsg = "illegal encoding";
5409 startinpos = ((const char *)q) - 2 - starts;
5410 endinpos = startinpos + 2;
5411 break;
5412 case 3:
5413 errmsg = "illegal UTF-16 surrogate";
5414 startinpos = ((const char *)q) - 4 - starts;
5415 endinpos = startinpos + 2;
5416 break;
5417 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005418 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005419 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005420 continue;
5421 }
5422
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005423 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005424 errors,
5425 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005426 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005427 &starts,
5428 (const char **)&e,
5429 &startinpos,
5430 &endinpos,
5431 &exc,
5432 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005433 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005434 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 }
5436
Antoine Pitrou63065d72012-05-15 23:48:04 +02005437End:
Walter Dörwald69652032004-09-07 20:24:22 +00005438 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005439 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005440
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005441 Py_XDECREF(errorHandler);
5442 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005443 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444
Benjamin Peterson29060642009-01-31 22:14:21 +00005445 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005446 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005447 Py_XDECREF(errorHandler);
5448 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449 return NULL;
5450}
5451
Tim Peters772747b2001-08-09 22:21:55 +00005452PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005453_PyUnicode_EncodeUTF16(PyObject *str,
5454 const char *errors,
5455 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005457 enum PyUnicode_Kind kind;
5458 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005459 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005460 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005461 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005462 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005463#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005464 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005465#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005466 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005467#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005468 const char *encoding;
5469 Py_ssize_t nsize, pos;
5470 PyObject *errorHandler = NULL;
5471 PyObject *exc = NULL;
5472 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005473
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005474 if (!PyUnicode_Check(str)) {
5475 PyErr_BadArgument();
5476 return NULL;
5477 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005478 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005479 return NULL;
5480 kind = PyUnicode_KIND(str);
5481 data = PyUnicode_DATA(str);
5482 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005483
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005484 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005485 if (kind == PyUnicode_4BYTE_KIND) {
5486 const Py_UCS4 *in = (const Py_UCS4 *)data;
5487 const Py_UCS4 *end = in + len;
5488 while (in < end)
5489 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005490 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005491 }
5492 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005493 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005494 nsize = len + pairs + (byteorder == 0);
5495 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496 if (v == NULL)
5497 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005499 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005500 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005501 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005503 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005504 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005505 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005506
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005507 if (kind == PyUnicode_1BYTE_KIND) {
5508 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5509 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005510 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005511
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005512 if (byteorder < 0)
5513 encoding = "utf-16-le";
5514 else if (byteorder > 0)
5515 encoding = "utf-16-be";
5516 else
5517 encoding = "utf-16";
5518
5519 pos = 0;
5520 while (pos < len) {
5521 Py_ssize_t repsize, moreunits;
5522
5523 if (kind == PyUnicode_2BYTE_KIND) {
5524 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5525 &out, native_ordering);
5526 }
5527 else {
5528 assert(kind == PyUnicode_4BYTE_KIND);
5529 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5530 &out, native_ordering);
5531 }
5532 if (pos == len)
5533 break;
5534
5535 rep = unicode_encode_call_errorhandler(
5536 errors, &errorHandler,
5537 encoding, "surrogates not allowed",
5538 str, &exc, pos, pos + 1, &pos);
5539 if (!rep)
5540 goto error;
5541
5542 if (PyBytes_Check(rep)) {
5543 repsize = PyBytes_GET_SIZE(rep);
5544 if (repsize & 1) {
5545 raise_encode_exception(&exc, encoding,
5546 str, pos - 1, pos,
5547 "surrogates not allowed");
5548 goto error;
5549 }
5550 moreunits = repsize / 2;
5551 }
5552 else {
5553 assert(PyUnicode_Check(rep));
5554 if (PyUnicode_READY(rep) < 0)
5555 goto error;
5556 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5557 if (!PyUnicode_IS_ASCII(rep)) {
5558 raise_encode_exception(&exc, encoding,
5559 str, pos - 1, pos,
5560 "surrogates not allowed");
5561 goto error;
5562 }
5563 }
5564
5565 /* two bytes are reserved for each surrogate */
5566 if (moreunits > 1) {
5567 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5568 Py_ssize_t morebytes = 2 * (moreunits - 1);
5569 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5570 /* integer overflow */
5571 PyErr_NoMemory();
5572 goto error;
5573 }
5574 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5575 goto error;
5576 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5577 }
5578
5579 if (PyBytes_Check(rep)) {
5580 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5581 out += moreunits;
5582 } else /* rep is unicode */ {
5583 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5584 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5585 &out, native_ordering);
5586 }
5587
5588 Py_CLEAR(rep);
5589 }
5590
5591 /* Cut back to size actually needed. This is necessary for, for example,
5592 encoding of a string containing isolated surrogates and the 'ignore' handler
5593 is used. */
5594 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5595 if (nsize != PyBytes_GET_SIZE(v))
5596 _PyBytes_Resize(&v, nsize);
5597 Py_XDECREF(errorHandler);
5598 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005599 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005600 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005601 error:
5602 Py_XDECREF(rep);
5603 Py_XDECREF(errorHandler);
5604 Py_XDECREF(exc);
5605 Py_XDECREF(v);
5606 return NULL;
5607#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608}
5609
Alexander Belopolsky40018472011-02-26 01:02:56 +00005610PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005611PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5612 Py_ssize_t size,
5613 const char *errors,
5614 int byteorder)
5615{
5616 PyObject *result;
5617 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5618 if (tmp == NULL)
5619 return NULL;
5620 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5621 Py_DECREF(tmp);
5622 return result;
5623}
5624
5625PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005626PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005628 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629}
5630
5631/* --- Unicode Escape Codec ----------------------------------------------- */
5632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005633/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5634 if all the escapes in the string make it still a valid ASCII string.
5635 Returns -1 if any escapes were found which cause the string to
5636 pop out of ASCII range. Otherwise returns the length of the
5637 required buffer to hold the string.
5638 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005639static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005640length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5641{
5642 const unsigned char *p = (const unsigned char *)s;
5643 const unsigned char *end = p + size;
5644 Py_ssize_t length = 0;
5645
5646 if (size < 0)
5647 return -1;
5648
5649 for (; p < end; ++p) {
5650 if (*p > 127) {
5651 /* Non-ASCII */
5652 return -1;
5653 }
5654 else if (*p != '\\') {
5655 /* Normal character */
5656 ++length;
5657 }
5658 else {
5659 /* Backslash-escape, check next char */
5660 ++p;
5661 /* Escape sequence reaches till end of string or
5662 non-ASCII follow-up. */
5663 if (p >= end || *p > 127)
5664 return -1;
5665 switch (*p) {
5666 case '\n':
5667 /* backslash + \n result in zero characters */
5668 break;
5669 case '\\': case '\'': case '\"':
5670 case 'b': case 'f': case 't':
5671 case 'n': case 'r': case 'v': case 'a':
5672 ++length;
5673 break;
5674 case '0': case '1': case '2': case '3':
5675 case '4': case '5': case '6': case '7':
5676 case 'x': case 'u': case 'U': case 'N':
5677 /* these do not guarantee ASCII characters */
5678 return -1;
5679 default:
5680 /* count the backslash + the other character */
5681 length += 2;
5682 }
5683 }
5684 }
5685 return length;
5686}
5687
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in on demand by the named-escape
   handling in PyUnicode_DecodeUnicodeEscape -- confirm against its body. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005688static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005689
Alexander Belopolsky40018472011-02-26 01:02:56 +00005690PyObject *
5691PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005692 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005693 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005695 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005696 Py_ssize_t startinpos;
5697 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005698 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005700 char* message;
5701 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005702 PyObject *errorHandler = NULL;
5703 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005704 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005705
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005706 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005707 if (len == 0)
5708 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005709
5710 /* After length_of_escaped_ascii_string() there are two alternatives,
5711 either the string is pure ASCII with named escapes like \n, etc.
5712 and we determined it's exact size (common case)
5713 or it contains \x, \u, ... escape sequences. then we create a
5714 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005715 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005716 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005717 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005718 }
5719 else {
5720 /* Escaped strings will always be longer than the resulting
5721 Unicode string, so we start with size here and then reduce the
5722 length after conversion to the true value.
5723 (but if the error callback returns a long replacement string
5724 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005725 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005726 }
5727
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005729 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005731
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 while (s < end) {
5733 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005734 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736
5737 /* Non-escape characters are interpreted as Unicode ordinals */
5738 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005739 x = (unsigned char)*s;
5740 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005741 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005742 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 continue;
5744 }
5745
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005746 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747 /* \ - Escapes */
5748 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005749 c = *s++;
5750 if (s > end)
5751 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005752
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005753 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754
Benjamin Peterson29060642009-01-31 22:14:21 +00005755 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005756#define WRITECHAR(ch) \
5757 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005758 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005759 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005760 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005761
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005763 case '\\': WRITECHAR('\\'); break;
5764 case '\'': WRITECHAR('\''); break;
5765 case '\"': WRITECHAR('\"'); break;
5766 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005767 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005768 case 'f': WRITECHAR('\014'); break;
5769 case 't': WRITECHAR('\t'); break;
5770 case 'n': WRITECHAR('\n'); break;
5771 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005772 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005773 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005774 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005775 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776
Benjamin Peterson29060642009-01-31 22:14:21 +00005777 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 case '0': case '1': case '2': case '3':
5779 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005780 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005781 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005782 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005783 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005784 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005786 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 break;
5788
Benjamin Peterson29060642009-01-31 22:14:21 +00005789 /* hex escapes */
5790 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005792 digits = 2;
5793 message = "truncated \\xXX escape";
5794 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795
Benjamin Peterson29060642009-01-31 22:14:21 +00005796 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005798 digits = 4;
5799 message = "truncated \\uXXXX escape";
5800 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801
Benjamin Peterson29060642009-01-31 22:14:21 +00005802 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005803 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005804 digits = 8;
5805 message = "truncated \\UXXXXXXXX escape";
5806 hexescape:
5807 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005808 if (end - s < digits) {
5809 /* count only hex digits */
5810 for (; s < end; ++s) {
5811 c = (unsigned char)*s;
5812 if (!Py_ISXDIGIT(c))
5813 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005814 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005815 goto error;
5816 }
5817 for (; digits--; ++s) {
5818 c = (unsigned char)*s;
5819 if (!Py_ISXDIGIT(c))
5820 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005821 chr = (chr<<4) & ~0xF;
5822 if (c >= '0' && c <= '9')
5823 chr += c - '0';
5824 else if (c >= 'a' && c <= 'f')
5825 chr += 10 + c - 'a';
5826 else
5827 chr += 10 + c - 'A';
5828 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005829 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005830 /* _decoding_error will have already written into the
5831 target buffer. */
5832 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005833 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005834 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005835 message = "illegal Unicode character";
5836 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005837 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005838 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005839 break;
5840
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005842 case 'N':
5843 message = "malformed \\N character escape";
5844 if (ucnhash_CAPI == NULL) {
5845 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005846 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5847 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005848 if (ucnhash_CAPI == NULL)
5849 goto ucnhashError;
5850 }
5851 if (*s == '{') {
5852 const char *start = s+1;
5853 /* look for the closing brace */
5854 while (*s != '}' && s < end)
5855 s++;
5856 if (s > start && s < end && *s == '}') {
5857 /* found a name. look it up in the unicode database */
5858 message = "unknown Unicode character name";
5859 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005860 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005861 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005862 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005863 goto store;
5864 }
5865 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005866 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005867
5868 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005869 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005870 message = "\\ at end of string";
5871 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005872 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005873 }
5874 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005875 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005876 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005877 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005878 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005880 continue;
5881
5882 error:
5883 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005884 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005885 errors, &errorHandler,
5886 "unicodeescape", message,
5887 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005888 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005889 goto onError;
5890 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005892#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005893
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005894 Py_XDECREF(errorHandler);
5895 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005896 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005897
Benjamin Peterson29060642009-01-31 22:14:21 +00005898 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005899 PyErr_SetString(
5900 PyExc_UnicodeError,
5901 "\\N escapes not supported (can't load unicodedata module)"
5902 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005903 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005904 Py_XDECREF(errorHandler);
5905 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005906 return NULL;
5907
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005909 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005910 Py_XDECREF(errorHandler);
5911 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 return NULL;
5913}
5914
/* Return a Unicode-Escape encoded version of the Unicode object as a
   bytes object.

   The result contains only the escaped characters; no surrounding quotes
   are added.

*/
5921
Alexander Belopolsky40018472011-02-26 01:02:56 +00005922PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005923PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005925 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005926 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005928 int kind;
5929 void *data;
5930 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931
Ezio Melottie7f90372012-10-05 03:33:31 +03005932 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005933 escape.
5934
Ezio Melottie7f90372012-10-05 03:33:31 +03005935 For UCS1 strings it's '\xxx', 4 bytes per source character.
5936 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5937 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005938 */
5939
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005940 if (!PyUnicode_Check(unicode)) {
5941 PyErr_BadArgument();
5942 return NULL;
5943 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005944 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005945 return NULL;
5946 len = PyUnicode_GET_LENGTH(unicode);
5947 kind = PyUnicode_KIND(unicode);
5948 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005949 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005950 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5951 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5952 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5953 }
5954
5955 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005956 return PyBytes_FromStringAndSize(NULL, 0);
5957
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005958 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005959 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005960
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005961 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005963 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005964 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 if (repr == NULL)
5966 return NULL;
5967
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005968 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005970 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005971 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005972
Walter Dörwald79e913e2007-05-12 11:08:06 +00005973 /* Escape backslashes */
5974 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 *p++ = '\\';
5976 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005977 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005978 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005979
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005980 /* Map 21-bit characters to '\U00xxxxxx' */
5981 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005982 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005983 *p++ = '\\';
5984 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005985 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5986 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5987 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5988 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5989 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5990 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5991 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5992 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005993 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005994 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005995
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005997 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 *p++ = '\\';
5999 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006000 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6001 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6002 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6003 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006005
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006006 /* Map special whitespace to '\t', \n', '\r' */
6007 else if (ch == '\t') {
6008 *p++ = '\\';
6009 *p++ = 't';
6010 }
6011 else if (ch == '\n') {
6012 *p++ = '\\';
6013 *p++ = 'n';
6014 }
6015 else if (ch == '\r') {
6016 *p++ = '\\';
6017 *p++ = 'r';
6018 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006019
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006020 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006021 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006023 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006024 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6025 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006026 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006027
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 /* Copy everything else as-is */
6029 else
6030 *p++ = (char) ch;
6031 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006033 assert(p - PyBytes_AS_STRING(repr) > 0);
6034 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6035 return NULL;
6036 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037}
6038
Alexander Belopolsky40018472011-02-26 01:02:56 +00006039PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006040PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6041 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006043 PyObject *result;
6044 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6045 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006047 result = PyUnicode_AsUnicodeEscapeString(tmp);
6048 Py_DECREF(tmp);
6049 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050}
6051
6052/* --- Raw Unicode Escape Codec ------------------------------------------- */
6053
Alexander Belopolsky40018472011-02-26 01:02:56 +00006054PyObject *
6055PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006056 Py_ssize_t size,
6057 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006059 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006060 Py_ssize_t startinpos;
6061 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006062 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 const char *end;
6064 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006065 PyObject *errorHandler = NULL;
6066 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006067
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006068 if (size == 0)
6069 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006070
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 /* Escaped strings will always be longer than the resulting
6072 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006073 length after conversion to the true value. (But decoding error
6074 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006075 _PyUnicodeWriter_Init(&writer);
6076 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006077
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 end = s + size;
6079 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 unsigned char c;
6081 Py_UCS4 x;
6082 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006083 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084
Benjamin Peterson29060642009-01-31 22:14:21 +00006085 /* Non-escape characters are interpreted as Unicode ordinals */
6086 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006087 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006088 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006089 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006091 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006092 startinpos = s-starts;
6093
6094 /* \u-escapes are only interpreted iff the number of leading
6095 backslashes if odd */
6096 bs = s;
6097 for (;s < end;) {
6098 if (*s != '\\')
6099 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006100 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006101 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006102 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 }
6104 if (((s - bs) & 1) == 0 ||
6105 s >= end ||
6106 (*s != 'u' && *s != 'U')) {
6107 continue;
6108 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006109 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 count = *s=='u' ? 4 : 8;
6111 s++;
6112
6113 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006114 for (x = 0, i = 0; i < count; ++i, ++s) {
6115 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006116 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006117 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006118 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 errors, &errorHandler,
6120 "rawunicodeescape", "truncated \\uXXXX",
6121 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006122 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006123 goto onError;
6124 goto nextByte;
6125 }
6126 x = (x<<4) & ~0xF;
6127 if (c >= '0' && c <= '9')
6128 x += c - '0';
6129 else if (c >= 'a' && c <= 'f')
6130 x += 10 + c - 'a';
6131 else
6132 x += 10 + c - 'A';
6133 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006134 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006135 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006136 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006137 }
6138 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006139 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006140 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006141 errors, &errorHandler,
6142 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006143 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006144 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006146 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006147 nextByte:
6148 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006150 Py_XDECREF(errorHandler);
6151 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006152 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006153
Benjamin Peterson29060642009-01-31 22:14:21 +00006154 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006155 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006156 Py_XDECREF(errorHandler);
6157 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158 return NULL;
6159}
6160
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006161
Alexander Belopolsky40018472011-02-26 01:02:56 +00006162PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006163PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006165 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166 char *p;
6167 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006168 Py_ssize_t expandsize, pos;
6169 int kind;
6170 void *data;
6171 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006173 if (!PyUnicode_Check(unicode)) {
6174 PyErr_BadArgument();
6175 return NULL;
6176 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006177 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006178 return NULL;
6179 kind = PyUnicode_KIND(unicode);
6180 data = PyUnicode_DATA(unicode);
6181 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006182 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6183 bytes, and 1 byte characters 4. */
6184 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006185
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006186 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006187 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006188
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006189 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 if (repr == NULL)
6191 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006192 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006193 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006195 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006196 for (pos = 0; pos < len; pos++) {
6197 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006198 /* Map 32-bit characters to '\Uxxxxxxxx' */
6199 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006200 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006201 *p++ = '\\';
6202 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006203 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6204 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6205 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6206 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6207 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6208 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6209 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6210 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006211 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006212 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006213 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 *p++ = '\\';
6215 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006216 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6217 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6218 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6219 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 /* Copy everything else as-is */
6222 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223 *p++ = (char) ch;
6224 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006225
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006226 assert(p > q);
6227 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006228 return NULL;
6229 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230}
6231
Alexander Belopolsky40018472011-02-26 01:02:56 +00006232PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006233PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6234 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006236 PyObject *result;
6237 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6238 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006239 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006240 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6241 Py_DECREF(tmp);
6242 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243}
6244
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006245/* --- Unicode Internal Codec ------------------------------------------- */
6246
Alexander Belopolsky40018472011-02-26 01:02:56 +00006247PyObject *
6248_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006249 Py_ssize_t size,
6250 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006251{
6252 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006253 Py_ssize_t startinpos;
6254 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006255 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006256 const char *end;
6257 const char *reason;
6258 PyObject *errorHandler = NULL;
6259 PyObject *exc = NULL;
6260
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006261 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006262 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006263 1))
6264 return NULL;
6265
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006266 if (size == 0)
6267 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006268
Victor Stinner8f674cc2013-04-17 23:02:17 +02006269 _PyUnicodeWriter_Init(&writer);
6270 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6271 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006272 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006273 }
6274 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006275
Victor Stinner8f674cc2013-04-17 23:02:17 +02006276 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006277 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006278 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006279 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006280 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006281 endinpos = end-starts;
6282 reason = "truncated input";
6283 goto error;
6284 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006285 /* We copy the raw representation one byte at a time because the
6286 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006287 ((char *) &uch)[0] = s[0];
6288 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006289#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006290 ((char *) &uch)[2] = s[2];
6291 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006292#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006293 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006294#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006295 /* We have to sanity check the raw data, otherwise doom looms for
6296 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006297 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006298 endinpos = s - starts + Py_UNICODE_SIZE;
6299 reason = "illegal code point (> 0x10FFFF)";
6300 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006301 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006302#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006303 s += Py_UNICODE_SIZE;
6304#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006305 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006306 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006307 Py_UNICODE uch2;
6308 ((char *) &uch2)[0] = s[0];
6309 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006310 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006311 {
Victor Stinner551ac952011-11-29 22:58:13 +01006312 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006313 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006314 }
6315 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006316#endif
6317
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006318 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006319 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006320 continue;
6321
6322 error:
6323 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006324 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006325 errors, &errorHandler,
6326 "unicode_internal", reason,
6327 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006328 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006329 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006330 }
6331
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006332 Py_XDECREF(errorHandler);
6333 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006334 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006335
Benjamin Peterson29060642009-01-31 22:14:21 +00006336 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006337 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006338 Py_XDECREF(errorHandler);
6339 Py_XDECREF(exc);
6340 return NULL;
6341}
6342
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343/* --- Latin-1 Codec ------------------------------------------------------ */
6344
Alexander Belopolsky40018472011-02-26 01:02:56 +00006345PyObject *
6346PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006347 Py_ssize_t size,
6348 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006351 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352}
6353
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006354/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006355static void
6356make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006357 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006358 PyObject *unicode,
6359 Py_ssize_t startpos, Py_ssize_t endpos,
6360 const char *reason)
6361{
6362 if (*exceptionObject == NULL) {
6363 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006364 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006365 encoding, unicode, startpos, endpos, reason);
6366 }
6367 else {
6368 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6369 goto onError;
6370 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6371 goto onError;
6372 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6373 goto onError;
6374 return;
6375 onError:
6376 Py_DECREF(*exceptionObject);
6377 *exceptionObject = NULL;
6378 }
6379}
6380
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006381/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006382static void
6383raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006384 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006385 PyObject *unicode,
6386 Py_ssize_t startpos, Py_ssize_t endpos,
6387 const char *reason)
6388{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006389 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006390 encoding, unicode, startpos, endpos, reason);
6391 if (*exceptionObject != NULL)
6392 PyCodec_StrictErrors(*exceptionObject);
6393}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006394
6395/* error handling callback helper:
6396 build arguments, call the callback and check the arguments,
6397 put the result into newpos and return the replacement string, which
6398 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006399static PyObject *
6400unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006401 PyObject **errorHandler,
6402 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006403 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006404 Py_ssize_t startpos, Py_ssize_t endpos,
6405 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006406{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006407 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006408 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006409 PyObject *restuple;
6410 PyObject *resunicode;
6411
6412 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006413 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006416 }
6417
Benjamin Petersonbac79492012-01-14 13:34:47 -05006418 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006419 return NULL;
6420 len = PyUnicode_GET_LENGTH(unicode);
6421
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006422 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006423 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006424 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006425 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006426
6427 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006428 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006429 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006430 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006431 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006432 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006433 Py_DECREF(restuple);
6434 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006435 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006436 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 &resunicode, newpos)) {
6438 Py_DECREF(restuple);
6439 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006440 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006441 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6442 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6443 Py_DECREF(restuple);
6444 return NULL;
6445 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006446 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006447 *newpos = len + *newpos;
6448 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006449 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6450 Py_DECREF(restuple);
6451 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006452 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006453 Py_INCREF(resunicode);
6454 Py_DECREF(restuple);
6455 return resunicode;
6456}
6457
Alexander Belopolsky40018472011-02-26 01:02:56 +00006458static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006459unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006460 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006461 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006462{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006463 /* input state */
6464 Py_ssize_t pos=0, size;
6465 int kind;
6466 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006467 /* output object */
6468 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006469 /* pointer into the output */
6470 char *str;
6471 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006472 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006473 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6474 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006475 PyObject *errorHandler = NULL;
6476 PyObject *exc = NULL;
6477 /* the following variable is used for caching string comparisons
6478 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6479 int known_errorHandler = -1;
6480
Benjamin Petersonbac79492012-01-14 13:34:47 -05006481 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006482 return NULL;
6483 size = PyUnicode_GET_LENGTH(unicode);
6484 kind = PyUnicode_KIND(unicode);
6485 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006486 /* allocate enough for a simple encoding without
6487 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006488 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006489 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006490 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006491 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006492 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006493 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006494 ressize = size;
6495
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006496 while (pos < size) {
6497 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006498
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 /* can we encode this? */
6500 if (c<limit) {
6501 /* no overflow check, because we know that the space is enough */
6502 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006503 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006504 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006505 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006506 Py_ssize_t requiredsize;
6507 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006508 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006509 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006510 Py_ssize_t collstart = pos;
6511 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006513 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 ++collend;
6515 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6516 if (known_errorHandler==-1) {
6517 if ((errors==NULL) || (!strcmp(errors, "strict")))
6518 known_errorHandler = 1;
6519 else if (!strcmp(errors, "replace"))
6520 known_errorHandler = 2;
6521 else if (!strcmp(errors, "ignore"))
6522 known_errorHandler = 3;
6523 else if (!strcmp(errors, "xmlcharrefreplace"))
6524 known_errorHandler = 4;
6525 else
6526 known_errorHandler = 0;
6527 }
6528 switch (known_errorHandler) {
6529 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006530 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006531 goto onError;
6532 case 2: /* replace */
6533 while (collstart++<collend)
6534 *str++ = '?'; /* fall through */
6535 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006536 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006537 break;
6538 case 4: /* xmlcharrefreplace */
6539 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006540 /* determine replacement size */
6541 for (i = collstart, repsize = 0; i < collend; ++i) {
6542 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6543 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006545 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006546 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006547 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006548 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006549 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006550 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006551 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006552 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006553 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006554 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006555 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006556 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006557 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006558 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006560 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006561 if (requiredsize > ressize) {
6562 if (requiredsize<2*ressize)
6563 requiredsize = 2*ressize;
6564 if (_PyBytes_Resize(&res, requiredsize))
6565 goto onError;
6566 str = PyBytes_AS_STRING(res) + respos;
6567 ressize = requiredsize;
6568 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006569 /* generate replacement */
6570 for (i = collstart; i < collend; ++i) {
6571 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006572 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006573 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 break;
6575 default:
6576 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006577 encoding, reason, unicode, &exc,
6578 collstart, collend, &newpos);
6579 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006580 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006582 if (PyBytes_Check(repunicode)) {
6583 /* Directly copy bytes result to output. */
6584 repsize = PyBytes_Size(repunicode);
6585 if (repsize > 1) {
6586 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006587 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006588 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6589 Py_DECREF(repunicode);
6590 goto onError;
6591 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006592 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006593 ressize += repsize-1;
6594 }
6595 memcpy(str, PyBytes_AsString(repunicode), repsize);
6596 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006597 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006598 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006599 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006600 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006601 /* need more space? (at least enough for what we
6602 have+the replacement+the rest of the string, so
6603 we won't have to check space for encodable characters) */
6604 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006605 repsize = PyUnicode_GET_LENGTH(repunicode);
6606 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 if (requiredsize > ressize) {
6608 if (requiredsize<2*ressize)
6609 requiredsize = 2*ressize;
6610 if (_PyBytes_Resize(&res, requiredsize)) {
6611 Py_DECREF(repunicode);
6612 goto onError;
6613 }
6614 str = PyBytes_AS_STRING(res) + respos;
6615 ressize = requiredsize;
6616 }
6617 /* check if there is anything unencodable in the replacement
6618 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006619 for (i = 0; repsize-->0; ++i, ++str) {
6620 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006621 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006622 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006623 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006624 Py_DECREF(repunicode);
6625 goto onError;
6626 }
6627 *str = (char)c;
6628 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006629 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006630 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006631 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006632 }
6633 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006634 /* Resize if we allocated to much */
6635 size = str - PyBytes_AS_STRING(res);
6636 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006637 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006638 if (_PyBytes_Resize(&res, size) < 0)
6639 goto onError;
6640 }
6641
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006642 Py_XDECREF(errorHandler);
6643 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006644 return res;
6645
6646 onError:
6647 Py_XDECREF(res);
6648 Py_XDECREF(errorHandler);
6649 Py_XDECREF(exc);
6650 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006651}
6652
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006653/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006654PyObject *
6655PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006656 Py_ssize_t size,
6657 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006659 PyObject *result;
6660 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6661 if (unicode == NULL)
6662 return NULL;
6663 result = unicode_encode_ucs1(unicode, errors, 256);
6664 Py_DECREF(unicode);
6665 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666}
6667
Alexander Belopolsky40018472011-02-26 01:02:56 +00006668PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006669_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670{
6671 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 PyErr_BadArgument();
6673 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006675 if (PyUnicode_READY(unicode) == -1)
6676 return NULL;
6677 /* Fast path: if it is a one-byte string, construct
6678 bytes object directly. */
6679 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6680 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6681 PyUnicode_GET_LENGTH(unicode));
6682 /* Non-Latin-1 characters present. Defer to above function to
6683 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006684 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006685}
6686
6687PyObject*
6688PyUnicode_AsLatin1String(PyObject *unicode)
6689{
6690 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691}
6692
6693/* --- 7-bit ASCII Codec -------------------------------------------------- */
6694
Alexander Belopolsky40018472011-02-26 01:02:56 +00006695PyObject *
6696PyUnicode_DecodeASCII(const char *s,
6697 Py_ssize_t size,
6698 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006700 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006701 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006702 int kind;
6703 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006704 Py_ssize_t startinpos;
6705 Py_ssize_t endinpos;
6706 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006707 const char *e;
6708 PyObject *errorHandler = NULL;
6709 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006710
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006712 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006713
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006715 if (size == 1 && (unsigned char)s[0] < 128)
6716 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006717
Victor Stinner8f674cc2013-04-17 23:02:17 +02006718 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006719 writer.min_length = size;
6720 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006721 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006722
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006723 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006724 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006725 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006726 writer.pos = outpos;
6727 if (writer.pos == size)
6728 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006729
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006730 s += writer.pos;
6731 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006732 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006733 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006735 PyUnicode_WRITE(kind, data, writer.pos, c);
6736 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 ++s;
6738 }
6739 else {
6740 startinpos = s-starts;
6741 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006742 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 errors, &errorHandler,
6744 "ascii", "ordinal not in range(128)",
6745 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006746 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006747 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006748 kind = writer.kind;
6749 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006752 Py_XDECREF(errorHandler);
6753 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006754 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006755
Benjamin Peterson29060642009-01-31 22:14:21 +00006756 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006757 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006758 Py_XDECREF(errorHandler);
6759 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760 return NULL;
6761}
6762
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006763/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006764PyObject *
6765PyUnicode_EncodeASCII(const Py_UNICODE *p,
6766 Py_ssize_t size,
6767 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006769 PyObject *result;
6770 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6771 if (unicode == NULL)
6772 return NULL;
6773 result = unicode_encode_ucs1(unicode, errors, 128);
6774 Py_DECREF(unicode);
6775 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776}
6777
Alexander Belopolsky40018472011-02-26 01:02:56 +00006778PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006779_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780{
6781 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 PyErr_BadArgument();
6783 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006785 if (PyUnicode_READY(unicode) == -1)
6786 return NULL;
6787 /* Fast path: if it is an ASCII-only string, construct bytes object
6788 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006789 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006790 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6791 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006792 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006793}
6794
6795PyObject *
6796PyUnicode_AsASCIIString(PyObject *unicode)
6797{
6798 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799}
6800
Victor Stinner99b95382011-07-04 14:23:54 +02006801#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006802
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006803/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006804
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006805#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006806#define NEED_RETRY
6807#endif
6808
Victor Stinner3a50e702011-10-18 21:21:00 +02006809#ifndef WC_ERR_INVALID_CHARS
6810# define WC_ERR_INVALID_CHARS 0x0080
6811#endif
6812
6813static char*
6814code_page_name(UINT code_page, PyObject **obj)
6815{
6816 *obj = NULL;
6817 if (code_page == CP_ACP)
6818 return "mbcs";
6819 if (code_page == CP_UTF7)
6820 return "CP_UTF7";
6821 if (code_page == CP_UTF8)
6822 return "CP_UTF8";
6823
6824 *obj = PyBytes_FromFormat("cp%u", code_page);
6825 if (*obj == NULL)
6826 return NULL;
6827 return PyBytes_AS_STRING(*obj);
6828}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006829
Alexander Belopolsky40018472011-02-26 01:02:56 +00006830static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006831is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006832{
6833 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006834 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006835
Victor Stinner3a50e702011-10-18 21:21:00 +02006836 if (!IsDBCSLeadByteEx(code_page, *curr))
6837 return 0;
6838
6839 prev = CharPrevExA(code_page, s, curr, 0);
6840 if (prev == curr)
6841 return 1;
6842 /* FIXME: This code is limited to "true" double-byte encodings,
6843 as it assumes an incomplete character consists of a single
6844 byte. */
6845 if (curr - prev == 2)
6846 return 1;
6847 if (!IsDBCSLeadByteEx(code_page, *prev))
6848 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006849 return 0;
6850}
6851
Victor Stinner3a50e702011-10-18 21:21:00 +02006852static DWORD
6853decode_code_page_flags(UINT code_page)
6854{
6855 if (code_page == CP_UTF7) {
6856 /* The CP_UTF7 decoder only supports flags=0 */
6857 return 0;
6858 }
6859 else
6860 return MB_ERR_INVALID_CHARS;
6861}
6862
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006863/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006864 * Decode a byte string from a Windows code page into unicode object in strict
6865 * mode.
6866 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006867 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6868 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006869 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006870static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006871decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006872 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006873 const char *in,
6874 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006875{
Victor Stinner3a50e702011-10-18 21:21:00 +02006876 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006877 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006878 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006879
6880 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006881 assert(insize > 0);
6882 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6883 if (outsize <= 0)
6884 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006885
6886 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006887 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006888 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006889 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006890 if (*v == NULL)
6891 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006892 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006893 }
6894 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006895 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006896 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006897 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006898 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006899 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006900 }
6901
6902 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006903 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6904 if (outsize <= 0)
6905 goto error;
6906 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006907
Victor Stinner3a50e702011-10-18 21:21:00 +02006908error:
6909 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6910 return -2;
6911 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006912 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006913}
6914
Victor Stinner3a50e702011-10-18 21:21:00 +02006915/*
6916 * Decode a byte string from a code page into unicode object with an error
6917 * handler.
6918 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006919 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006920 * UnicodeDecodeError exception and returns -1 on error.
6921 */
6922static int
6923decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006924 PyObject **v,
6925 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006926 const char *errors)
6927{
6928 const char *startin = in;
6929 const char *endin = in + size;
6930 const DWORD flags = decode_code_page_flags(code_page);
6931 /* Ideally, we should get reason from FormatMessage. This is the Windows
6932 2000 English version of the message. */
6933 const char *reason = "No mapping for the Unicode character exists "
6934 "in the target code page.";
6935 /* each step cannot decode more than 1 character, but a character can be
6936 represented as a surrogate pair */
6937 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006938 int insize;
6939 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006940 PyObject *errorHandler = NULL;
6941 PyObject *exc = NULL;
6942 PyObject *encoding_obj = NULL;
6943 char *encoding;
6944 DWORD err;
6945 int ret = -1;
6946
6947 assert(size > 0);
6948
6949 encoding = code_page_name(code_page, &encoding_obj);
6950 if (encoding == NULL)
6951 return -1;
6952
6953 if (errors == NULL || strcmp(errors, "strict") == 0) {
6954 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6955 UnicodeDecodeError. */
6956 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6957 if (exc != NULL) {
6958 PyCodec_StrictErrors(exc);
6959 Py_CLEAR(exc);
6960 }
6961 goto error;
6962 }
6963
6964 if (*v == NULL) {
6965 /* Create unicode object */
6966 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6967 PyErr_NoMemory();
6968 goto error;
6969 }
Victor Stinnerab595942011-12-17 04:59:06 +01006970 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006971 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006972 if (*v == NULL)
6973 goto error;
6974 startout = PyUnicode_AS_UNICODE(*v);
6975 }
6976 else {
6977 /* Extend unicode object */
6978 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6979 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6980 PyErr_NoMemory();
6981 goto error;
6982 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006983 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006984 goto error;
6985 startout = PyUnicode_AS_UNICODE(*v) + n;
6986 }
6987
6988 /* Decode the byte string character per character */
6989 out = startout;
6990 while (in < endin)
6991 {
6992 /* Decode a character */
6993 insize = 1;
6994 do
6995 {
6996 outsize = MultiByteToWideChar(code_page, flags,
6997 in, insize,
6998 buffer, Py_ARRAY_LENGTH(buffer));
6999 if (outsize > 0)
7000 break;
7001 err = GetLastError();
7002 if (err != ERROR_NO_UNICODE_TRANSLATION
7003 && err != ERROR_INSUFFICIENT_BUFFER)
7004 {
7005 PyErr_SetFromWindowsErr(0);
7006 goto error;
7007 }
7008 insize++;
7009 }
7010 /* 4=maximum length of a UTF-8 sequence */
7011 while (insize <= 4 && (in + insize) <= endin);
7012
7013 if (outsize <= 0) {
7014 Py_ssize_t startinpos, endinpos, outpos;
7015
7016 startinpos = in - startin;
7017 endinpos = startinpos + 1;
7018 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007019 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007020 errors, &errorHandler,
7021 encoding, reason,
7022 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007023 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007024 {
7025 goto error;
7026 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007027 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007028 }
7029 else {
7030 in += insize;
7031 memcpy(out, buffer, outsize * sizeof(wchar_t));
7032 out += outsize;
7033 }
7034 }
7035
7036 /* write a NUL character at the end */
7037 *out = 0;
7038
7039 /* Extend unicode object */
7040 outsize = out - startout;
7041 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007042 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007043 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007044 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007045
7046error:
7047 Py_XDECREF(encoding_obj);
7048 Py_XDECREF(errorHandler);
7049 Py_XDECREF(exc);
7050 return ret;
7051}
7052
Victor Stinner3a50e702011-10-18 21:21:00 +02007053static PyObject *
7054decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007055 const char *s, Py_ssize_t size,
7056 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007057{
Victor Stinner76a31a62011-11-04 00:05:13 +01007058 PyObject *v = NULL;
7059 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007060
Victor Stinner3a50e702011-10-18 21:21:00 +02007061 if (code_page < 0) {
7062 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7063 return NULL;
7064 }
7065
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007066 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007067 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007068
Victor Stinner76a31a62011-11-04 00:05:13 +01007069 do
7070 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007071#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007072 if (size > INT_MAX) {
7073 chunk_size = INT_MAX;
7074 final = 0;
7075 done = 0;
7076 }
7077 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007078#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007079 {
7080 chunk_size = (int)size;
7081 final = (consumed == NULL);
7082 done = 1;
7083 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007084
Victor Stinner76a31a62011-11-04 00:05:13 +01007085 /* Skip trailing lead-byte unless 'final' is set */
7086 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7087 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007088
Victor Stinner76a31a62011-11-04 00:05:13 +01007089 if (chunk_size == 0 && done) {
7090 if (v != NULL)
7091 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007092 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007093 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007094
Victor Stinner76a31a62011-11-04 00:05:13 +01007095
7096 converted = decode_code_page_strict(code_page, &v,
7097 s, chunk_size);
7098 if (converted == -2)
7099 converted = decode_code_page_errors(code_page, &v,
7100 s, chunk_size,
7101 errors);
7102 assert(converted != 0);
7103
7104 if (converted < 0) {
7105 Py_XDECREF(v);
7106 return NULL;
7107 }
7108
7109 if (consumed)
7110 *consumed += converted;
7111
7112 s += converted;
7113 size -= converted;
7114 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007115
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007116 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007117}
7118
Alexander Belopolsky40018472011-02-26 01:02:56 +00007119PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007120PyUnicode_DecodeCodePageStateful(int code_page,
7121 const char *s,
7122 Py_ssize_t size,
7123 const char *errors,
7124 Py_ssize_t *consumed)
7125{
7126 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7127}
7128
7129PyObject *
7130PyUnicode_DecodeMBCSStateful(const char *s,
7131 Py_ssize_t size,
7132 const char *errors,
7133 Py_ssize_t *consumed)
7134{
7135 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7136}
7137
7138PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007139PyUnicode_DecodeMBCS(const char *s,
7140 Py_ssize_t size,
7141 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007142{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007143 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7144}
7145
Victor Stinner3a50e702011-10-18 21:21:00 +02007146static DWORD
7147encode_code_page_flags(UINT code_page, const char *errors)
7148{
7149 if (code_page == CP_UTF8) {
7150 if (winver.dwMajorVersion >= 6)
7151 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7152 and later */
7153 return WC_ERR_INVALID_CHARS;
7154 else
7155 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7156 return 0;
7157 }
7158 else if (code_page == CP_UTF7) {
7159 /* CP_UTF7 only supports flags=0 */
7160 return 0;
7161 }
7162 else {
7163 if (errors != NULL && strcmp(errors, "replace") == 0)
7164 return 0;
7165 else
7166 return WC_NO_BEST_FIT_CHARS;
7167 }
7168}
7169
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007170/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007171 * Encode a Unicode string to a Windows code page into a byte string in strict
7172 * mode.
7173 *
7174 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007175 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007176 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007177static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007178encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007179 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007180 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007181{
Victor Stinner554f3f02010-06-16 23:33:54 +00007182 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007183 BOOL *pusedDefaultChar = &usedDefaultChar;
7184 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007185 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007186 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007187 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007188 const DWORD flags = encode_code_page_flags(code_page, NULL);
7189 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007190 /* Create a substring so that we can get the UTF-16 representation
7191 of just the slice under consideration. */
7192 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007193
Martin v. Löwis3d325192011-11-04 18:23:06 +01007194 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007195
Victor Stinner3a50e702011-10-18 21:21:00 +02007196 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007197 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007198 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007199 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007200
Victor Stinner2fc507f2011-11-04 20:06:39 +01007201 substring = PyUnicode_Substring(unicode, offset, offset+len);
7202 if (substring == NULL)
7203 return -1;
7204 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7205 if (p == NULL) {
7206 Py_DECREF(substring);
7207 return -1;
7208 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007209 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007210
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007211 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007212 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007213 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007214 NULL, 0,
7215 NULL, pusedDefaultChar);
7216 if (outsize <= 0)
7217 goto error;
7218 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007219 if (pusedDefaultChar && *pusedDefaultChar) {
7220 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007221 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007222 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007223
Victor Stinner3a50e702011-10-18 21:21:00 +02007224 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007225 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007226 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007227 if (*outbytes == NULL) {
7228 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007229 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007230 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007231 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007232 }
7233 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007234 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007235 const Py_ssize_t n = PyBytes_Size(*outbytes);
7236 if (outsize > PY_SSIZE_T_MAX - n) {
7237 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007238 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007239 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007240 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007241 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7242 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007243 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007244 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007245 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007246 }
7247
7248 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007249 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007250 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007251 out, outsize,
7252 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007253 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007254 if (outsize <= 0)
7255 goto error;
7256 if (pusedDefaultChar && *pusedDefaultChar)
7257 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007258 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007259
Victor Stinner3a50e702011-10-18 21:21:00 +02007260error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007261 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007262 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7263 return -2;
7264 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007265 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007266}
7267
Victor Stinner3a50e702011-10-18 21:21:00 +02007268/*
7269 * Encode a Unicode string to a Windows code page into a byte string using a
7270 * error handler.
7271 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007272 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007273 * -1 on other error.
7274 */
7275static int
7276encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007277 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007278 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007279{
Victor Stinner3a50e702011-10-18 21:21:00 +02007280 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007281 Py_ssize_t pos = unicode_offset;
7282 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007283 /* Ideally, we should get reason from FormatMessage. This is the Windows
7284 2000 English version of the message. */
7285 const char *reason = "invalid character";
7286 /* 4=maximum length of a UTF-8 sequence */
7287 char buffer[4];
7288 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7289 Py_ssize_t outsize;
7290 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007291 PyObject *errorHandler = NULL;
7292 PyObject *exc = NULL;
7293 PyObject *encoding_obj = NULL;
7294 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007295 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007296 PyObject *rep;
7297 int ret = -1;
7298
7299 assert(insize > 0);
7300
7301 encoding = code_page_name(code_page, &encoding_obj);
7302 if (encoding == NULL)
7303 return -1;
7304
7305 if (errors == NULL || strcmp(errors, "strict") == 0) {
7306 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7307 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007308 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007309 if (exc != NULL) {
7310 PyCodec_StrictErrors(exc);
7311 Py_DECREF(exc);
7312 }
7313 Py_XDECREF(encoding_obj);
7314 return -1;
7315 }
7316
7317 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7318 pusedDefaultChar = &usedDefaultChar;
7319 else
7320 pusedDefaultChar = NULL;
7321
7322 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7323 PyErr_NoMemory();
7324 goto error;
7325 }
7326 outsize = insize * Py_ARRAY_LENGTH(buffer);
7327
7328 if (*outbytes == NULL) {
7329 /* Create string object */
7330 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7331 if (*outbytes == NULL)
7332 goto error;
7333 out = PyBytes_AS_STRING(*outbytes);
7334 }
7335 else {
7336 /* Extend string object */
7337 Py_ssize_t n = PyBytes_Size(*outbytes);
7338 if (n > PY_SSIZE_T_MAX - outsize) {
7339 PyErr_NoMemory();
7340 goto error;
7341 }
7342 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7343 goto error;
7344 out = PyBytes_AS_STRING(*outbytes) + n;
7345 }
7346
7347 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007348 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007349 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007350 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7351 wchar_t chars[2];
7352 int charsize;
7353 if (ch < 0x10000) {
7354 chars[0] = (wchar_t)ch;
7355 charsize = 1;
7356 }
7357 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007358 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7359 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007360 charsize = 2;
7361 }
7362
Victor Stinner3a50e702011-10-18 21:21:00 +02007363 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007364 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007365 buffer, Py_ARRAY_LENGTH(buffer),
7366 NULL, pusedDefaultChar);
7367 if (outsize > 0) {
7368 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7369 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007370 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007371 memcpy(out, buffer, outsize);
7372 out += outsize;
7373 continue;
7374 }
7375 }
7376 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7377 PyErr_SetFromWindowsErr(0);
7378 goto error;
7379 }
7380
Victor Stinner3a50e702011-10-18 21:21:00 +02007381 rep = unicode_encode_call_errorhandler(
7382 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007383 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007384 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007385 if (rep == NULL)
7386 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007387 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007388
7389 if (PyBytes_Check(rep)) {
7390 outsize = PyBytes_GET_SIZE(rep);
7391 if (outsize != 1) {
7392 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7393 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7394 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7395 Py_DECREF(rep);
7396 goto error;
7397 }
7398 out = PyBytes_AS_STRING(*outbytes) + offset;
7399 }
7400 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7401 out += outsize;
7402 }
7403 else {
7404 Py_ssize_t i;
7405 enum PyUnicode_Kind kind;
7406 void *data;
7407
Benjamin Petersonbac79492012-01-14 13:34:47 -05007408 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007409 Py_DECREF(rep);
7410 goto error;
7411 }
7412
7413 outsize = PyUnicode_GET_LENGTH(rep);
7414 if (outsize != 1) {
7415 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7416 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7417 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7418 Py_DECREF(rep);
7419 goto error;
7420 }
7421 out = PyBytes_AS_STRING(*outbytes) + offset;
7422 }
7423 kind = PyUnicode_KIND(rep);
7424 data = PyUnicode_DATA(rep);
7425 for (i=0; i < outsize; i++) {
7426 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7427 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007428 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007429 encoding, unicode,
7430 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007431 "unable to encode error handler result to ASCII");
7432 Py_DECREF(rep);
7433 goto error;
7434 }
7435 *out = (unsigned char)ch;
7436 out++;
7437 }
7438 }
7439 Py_DECREF(rep);
7440 }
7441 /* write a NUL byte */
7442 *out = 0;
7443 outsize = out - PyBytes_AS_STRING(*outbytes);
7444 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7445 if (_PyBytes_Resize(outbytes, outsize) < 0)
7446 goto error;
7447 ret = 0;
7448
7449error:
7450 Py_XDECREF(encoding_obj);
7451 Py_XDECREF(errorHandler);
7452 Py_XDECREF(exc);
7453 return ret;
7454}
7455
Victor Stinner3a50e702011-10-18 21:21:00 +02007456static PyObject *
7457encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007458 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007459 const char *errors)
7460{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007461 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007462 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007463 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007464 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007465
Benjamin Petersonbac79492012-01-14 13:34:47 -05007466 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007467 return NULL;
7468 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007469
Victor Stinner3a50e702011-10-18 21:21:00 +02007470 if (code_page < 0) {
7471 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7472 return NULL;
7473 }
7474
Martin v. Löwis3d325192011-11-04 18:23:06 +01007475 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007476 return PyBytes_FromStringAndSize(NULL, 0);
7477
Victor Stinner7581cef2011-11-03 22:32:33 +01007478 offset = 0;
7479 do
7480 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007481#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007482 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007483 chunks. */
7484 if (len > INT_MAX/2) {
7485 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007486 done = 0;
7487 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007488 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007489#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007490 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007491 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007492 done = 1;
7493 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007494
Victor Stinner76a31a62011-11-04 00:05:13 +01007495 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007496 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007497 errors);
7498 if (ret == -2)
7499 ret = encode_code_page_errors(code_page, &outbytes,
7500 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007501 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007502 if (ret < 0) {
7503 Py_XDECREF(outbytes);
7504 return NULL;
7505 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007506
Victor Stinner7581cef2011-11-03 22:32:33 +01007507 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007508 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007509 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007510
Victor Stinner3a50e702011-10-18 21:21:00 +02007511 return outbytes;
7512}
7513
7514PyObject *
7515PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7516 Py_ssize_t size,
7517 const char *errors)
7518{
Victor Stinner7581cef2011-11-03 22:32:33 +01007519 PyObject *unicode, *res;
7520 unicode = PyUnicode_FromUnicode(p, size);
7521 if (unicode == NULL)
7522 return NULL;
7523 res = encode_code_page(CP_ACP, unicode, errors);
7524 Py_DECREF(unicode);
7525 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007526}
7527
7528PyObject *
7529PyUnicode_EncodeCodePage(int code_page,
7530 PyObject *unicode,
7531 const char *errors)
7532{
Victor Stinner7581cef2011-11-03 22:32:33 +01007533 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007534}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007535
Alexander Belopolsky40018472011-02-26 01:02:56 +00007536PyObject *
7537PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007538{
7539 if (!PyUnicode_Check(unicode)) {
7540 PyErr_BadArgument();
7541 return NULL;
7542 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007543 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007544}
7545
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007546#undef NEED_RETRY
7547
Victor Stinner99b95382011-07-04 14:23:54 +02007548#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007549
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550/* --- Character Mapping Codec -------------------------------------------- */
7551
Victor Stinnerfb161b12013-04-18 01:44:27 +02007552static int
7553charmap_decode_string(const char *s,
7554 Py_ssize_t size,
7555 PyObject *mapping,
7556 const char *errors,
7557 _PyUnicodeWriter *writer)
7558{
7559 const char *starts = s;
7560 const char *e;
7561 Py_ssize_t startinpos, endinpos;
7562 PyObject *errorHandler = NULL, *exc = NULL;
7563 Py_ssize_t maplen;
7564 enum PyUnicode_Kind mapkind;
7565 void *mapdata;
7566 Py_UCS4 x;
7567 unsigned char ch;
7568
7569 if (PyUnicode_READY(mapping) == -1)
7570 return -1;
7571
7572 maplen = PyUnicode_GET_LENGTH(mapping);
7573 mapdata = PyUnicode_DATA(mapping);
7574 mapkind = PyUnicode_KIND(mapping);
7575
7576 e = s + size;
7577
7578 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7579 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7580 * is disabled in encoding aliases, latin1 is preferred because
7581 * its implementation is faster. */
7582 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7583 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7584 Py_UCS4 maxchar = writer->maxchar;
7585
7586 assert (writer->kind == PyUnicode_1BYTE_KIND);
7587 while (s < e) {
7588 ch = *s;
7589 x = mapdata_ucs1[ch];
7590 if (x > maxchar) {
7591 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7592 goto onError;
7593 maxchar = writer->maxchar;
7594 outdata = (Py_UCS1 *)writer->data;
7595 }
7596 outdata[writer->pos] = x;
7597 writer->pos++;
7598 ++s;
7599 }
7600 return 0;
7601 }
7602
7603 while (s < e) {
7604 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7605 enum PyUnicode_Kind outkind = writer->kind;
7606 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7607 if (outkind == PyUnicode_1BYTE_KIND) {
7608 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7609 Py_UCS4 maxchar = writer->maxchar;
7610 while (s < e) {
7611 ch = *s;
7612 x = mapdata_ucs2[ch];
7613 if (x > maxchar)
7614 goto Error;
7615 outdata[writer->pos] = x;
7616 writer->pos++;
7617 ++s;
7618 }
7619 break;
7620 }
7621 else if (outkind == PyUnicode_2BYTE_KIND) {
7622 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7623 while (s < e) {
7624 ch = *s;
7625 x = mapdata_ucs2[ch];
7626 if (x == 0xFFFE)
7627 goto Error;
7628 outdata[writer->pos] = x;
7629 writer->pos++;
7630 ++s;
7631 }
7632 break;
7633 }
7634 }
7635 ch = *s;
7636
7637 if (ch < maplen)
7638 x = PyUnicode_READ(mapkind, mapdata, ch);
7639 else
7640 x = 0xfffe; /* invalid value */
7641Error:
7642 if (x == 0xfffe)
7643 {
7644 /* undefined mapping */
7645 startinpos = s-starts;
7646 endinpos = startinpos+1;
7647 if (unicode_decode_call_errorhandler_writer(
7648 errors, &errorHandler,
7649 "charmap", "character maps to <undefined>",
7650 &starts, &e, &startinpos, &endinpos, &exc, &s,
7651 writer)) {
7652 goto onError;
7653 }
7654 continue;
7655 }
7656
7657 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7658 goto onError;
7659 ++s;
7660 }
7661 Py_XDECREF(errorHandler);
7662 Py_XDECREF(exc);
7663 return 0;
7664
7665onError:
7666 Py_XDECREF(errorHandler);
7667 Py_XDECREF(exc);
7668 return -1;
7669}
7670
7671static int
7672charmap_decode_mapping(const char *s,
7673 Py_ssize_t size,
7674 PyObject *mapping,
7675 const char *errors,
7676 _PyUnicodeWriter *writer)
7677{
7678 const char *starts = s;
7679 const char *e;
7680 Py_ssize_t startinpos, endinpos;
7681 PyObject *errorHandler = NULL, *exc = NULL;
7682 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007683 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007684
7685 e = s + size;
7686
7687 while (s < e) {
7688 ch = *s;
7689
7690 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7691 key = PyLong_FromLong((long)ch);
7692 if (key == NULL)
7693 goto onError;
7694
7695 item = PyObject_GetItem(mapping, key);
7696 Py_DECREF(key);
7697 if (item == NULL) {
7698 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7699 /* No mapping found means: mapping is undefined. */
7700 PyErr_Clear();
7701 goto Undefined;
7702 } else
7703 goto onError;
7704 }
7705
7706 /* Apply mapping */
7707 if (item == Py_None)
7708 goto Undefined;
7709 if (PyLong_Check(item)) {
7710 long value = PyLong_AS_LONG(item);
7711 if (value == 0xFFFE)
7712 goto Undefined;
7713 if (value < 0 || value > MAX_UNICODE) {
7714 PyErr_Format(PyExc_TypeError,
7715 "character mapping must be in range(0x%lx)",
7716 (unsigned long)MAX_UNICODE + 1);
7717 goto onError;
7718 }
7719
7720 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7721 goto onError;
7722 }
7723 else if (PyUnicode_Check(item)) {
7724 if (PyUnicode_READY(item) == -1)
7725 goto onError;
7726 if (PyUnicode_GET_LENGTH(item) == 1) {
7727 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7728 if (value == 0xFFFE)
7729 goto Undefined;
7730 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7731 goto onError;
7732 }
7733 else {
7734 writer->overallocate = 1;
7735 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7736 goto onError;
7737 }
7738 }
7739 else {
7740 /* wrong return value */
7741 PyErr_SetString(PyExc_TypeError,
7742 "character mapping must return integer, None or str");
7743 goto onError;
7744 }
7745 Py_CLEAR(item);
7746 ++s;
7747 continue;
7748
7749Undefined:
7750 /* undefined mapping */
7751 Py_CLEAR(item);
7752 startinpos = s-starts;
7753 endinpos = startinpos+1;
7754 if (unicode_decode_call_errorhandler_writer(
7755 errors, &errorHandler,
7756 "charmap", "character maps to <undefined>",
7757 &starts, &e, &startinpos, &endinpos, &exc, &s,
7758 writer)) {
7759 goto onError;
7760 }
7761 }
7762 Py_XDECREF(errorHandler);
7763 Py_XDECREF(exc);
7764 return 0;
7765
7766onError:
7767 Py_XDECREF(item);
7768 Py_XDECREF(errorHandler);
7769 Py_XDECREF(exc);
7770 return -1;
7771}
7772
Alexander Belopolsky40018472011-02-26 01:02:56 +00007773PyObject *
7774PyUnicode_DecodeCharmap(const char *s,
7775 Py_ssize_t size,
7776 PyObject *mapping,
7777 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007779 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007780
Guido van Rossumd57fd912000-03-10 22:53:23 +00007781 /* Default to Latin-1 */
7782 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007786 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007787 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007788 writer.min_length = size;
7789 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007791
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007792 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007793 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7794 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007795 }
7796 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007797 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7798 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007800 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007801
Benjamin Peterson29060642009-01-31 22:14:21 +00007802 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007803 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804 return NULL;
7805}
7806
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007807/* Charmap encoding: the lookup table */
7808
Alexander Belopolsky40018472011-02-26 01:02:56 +00007809struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007810 PyObject_HEAD
7811 unsigned char level1[32];
7812 int count2, count3;
7813 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007814};
7815
7816static PyObject*
7817encoding_map_size(PyObject *obj, PyObject* args)
7818{
7819 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007820 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007821 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007822}
7823
7824static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007825 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007826 PyDoc_STR("Return the size (in bytes) of this object") },
7827 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007828};
7829
7830static void
7831encoding_map_dealloc(PyObject* o)
7832{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007833 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007834}
7835
7836static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007837 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007838 "EncodingMap", /*tp_name*/
7839 sizeof(struct encoding_map), /*tp_basicsize*/
7840 0, /*tp_itemsize*/
7841 /* methods */
7842 encoding_map_dealloc, /*tp_dealloc*/
7843 0, /*tp_print*/
7844 0, /*tp_getattr*/
7845 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007846 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007847 0, /*tp_repr*/
7848 0, /*tp_as_number*/
7849 0, /*tp_as_sequence*/
7850 0, /*tp_as_mapping*/
7851 0, /*tp_hash*/
7852 0, /*tp_call*/
7853 0, /*tp_str*/
7854 0, /*tp_getattro*/
7855 0, /*tp_setattro*/
7856 0, /*tp_as_buffer*/
7857 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7858 0, /*tp_doc*/
7859 0, /*tp_traverse*/
7860 0, /*tp_clear*/
7861 0, /*tp_richcompare*/
7862 0, /*tp_weaklistoffset*/
7863 0, /*tp_iter*/
7864 0, /*tp_iternext*/
7865 encoding_map_methods, /*tp_methods*/
7866 0, /*tp_members*/
7867 0, /*tp_getset*/
7868 0, /*tp_base*/
7869 0, /*tp_dict*/
7870 0, /*tp_descr_get*/
7871 0, /*tp_descr_set*/
7872 0, /*tp_dictoffset*/
7873 0, /*tp_init*/
7874 0, /*tp_alloc*/
7875 0, /*tp_new*/
7876 0, /*tp_free*/
7877 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007878};
7879
7880PyObject*
7881PyUnicode_BuildEncodingMap(PyObject* string)
7882{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007883 PyObject *result;
7884 struct encoding_map *mresult;
7885 int i;
7886 int need_dict = 0;
7887 unsigned char level1[32];
7888 unsigned char level2[512];
7889 unsigned char *mlevel1, *mlevel2, *mlevel3;
7890 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007891 int kind;
7892 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007893 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007894 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007895
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007896 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007897 PyErr_BadArgument();
7898 return NULL;
7899 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007900 kind = PyUnicode_KIND(string);
7901 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007902 length = PyUnicode_GET_LENGTH(string);
7903 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007904 memset(level1, 0xFF, sizeof level1);
7905 memset(level2, 0xFF, sizeof level2);
7906
7907 /* If there isn't a one-to-one mapping of NULL to \0,
7908 or if there are non-BMP characters, we need to use
7909 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007910 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007911 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007912 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007913 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007914 ch = PyUnicode_READ(kind, data, i);
7915 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007916 need_dict = 1;
7917 break;
7918 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007919 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007920 /* unmapped character */
7921 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007922 l1 = ch >> 11;
7923 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007924 if (level1[l1] == 0xFF)
7925 level1[l1] = count2++;
7926 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007927 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007928 }
7929
7930 if (count2 >= 0xFF || count3 >= 0xFF)
7931 need_dict = 1;
7932
7933 if (need_dict) {
7934 PyObject *result = PyDict_New();
7935 PyObject *key, *value;
7936 if (!result)
7937 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007938 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007939 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007940 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007941 if (!key || !value)
7942 goto failed1;
7943 if (PyDict_SetItem(result, key, value) == -1)
7944 goto failed1;
7945 Py_DECREF(key);
7946 Py_DECREF(value);
7947 }
7948 return result;
7949 failed1:
7950 Py_XDECREF(key);
7951 Py_XDECREF(value);
7952 Py_DECREF(result);
7953 return NULL;
7954 }
7955
7956 /* Create a three-level trie */
7957 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7958 16*count2 + 128*count3 - 1);
7959 if (!result)
7960 return PyErr_NoMemory();
7961 PyObject_Init(result, &EncodingMapType);
7962 mresult = (struct encoding_map*)result;
7963 mresult->count2 = count2;
7964 mresult->count3 = count3;
7965 mlevel1 = mresult->level1;
7966 mlevel2 = mresult->level23;
7967 mlevel3 = mresult->level23 + 16*count2;
7968 memcpy(mlevel1, level1, 32);
7969 memset(mlevel2, 0xFF, 16*count2);
7970 memset(mlevel3, 0, 128*count3);
7971 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007972 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007973 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007974 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7975 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007976 /* unmapped character */
7977 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007978 o1 = ch>>11;
7979 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007980 i2 = 16*mlevel1[o1] + o2;
7981 if (mlevel2[i2] == 0xFF)
7982 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007983 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007984 i3 = 128*mlevel2[i2] + o3;
7985 mlevel3[i3] = i;
7986 }
7987 return result;
7988}
7989
7990static int
Victor Stinner22168992011-11-20 17:09:18 +01007991encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007992{
7993 struct encoding_map *map = (struct encoding_map*)mapping;
7994 int l1 = c>>11;
7995 int l2 = (c>>7) & 0xF;
7996 int l3 = c & 0x7F;
7997 int i;
7998
Victor Stinner22168992011-11-20 17:09:18 +01007999 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008000 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008001 if (c == 0)
8002 return 0;
8003 /* level 1*/
8004 i = map->level1[l1];
8005 if (i == 0xFF) {
8006 return -1;
8007 }
8008 /* level 2*/
8009 i = map->level23[16*i+l2];
8010 if (i == 0xFF) {
8011 return -1;
8012 }
8013 /* level 3 */
8014 i = map->level23[16*map->count2 + 128*i + l3];
8015 if (i == 0) {
8016 return -1;
8017 }
8018 return i;
8019}
8020
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008021/* Lookup the character ch in the mapping. If the character
8022 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008023 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008024static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008025charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026{
Christian Heimes217cfd12007-12-02 14:31:20 +00008027 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008028 PyObject *x;
8029
8030 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008031 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008032 x = PyObject_GetItem(mapping, w);
8033 Py_DECREF(w);
8034 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008035 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8036 /* No mapping found means: mapping is undefined. */
8037 PyErr_Clear();
8038 x = Py_None;
8039 Py_INCREF(x);
8040 return x;
8041 } else
8042 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008044 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008046 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 long value = PyLong_AS_LONG(x);
8048 if (value < 0 || value > 255) {
8049 PyErr_SetString(PyExc_TypeError,
8050 "character mapping must be in range(256)");
8051 Py_DECREF(x);
8052 return NULL;
8053 }
8054 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008056 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008057 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008059 /* wrong return value */
8060 PyErr_Format(PyExc_TypeError,
8061 "character mapping must return integer, bytes or None, not %.400s",
8062 x->ob_type->tp_name);
8063 Py_DECREF(x);
8064 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065 }
8066}
8067
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008068static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008069charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008070{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008071 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8072 /* exponentially overallocate to minimize reallocations */
8073 if (requiredsize < 2*outsize)
8074 requiredsize = 2*outsize;
8075 if (_PyBytes_Resize(outobj, requiredsize))
8076 return -1;
8077 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008078}
8079
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the character is unmapped; nothing written */
    enc_EXCEPTION       /* an error occurred and an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008083/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008084 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008085 space is available. Return a new reference to the object that
8086 was put in the output buffer, or Py_None, if the mapping was undefined
8087 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008088 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008089static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008090charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008091 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008092{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008093 PyObject *rep;
8094 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008095 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008096
Christian Heimes90aa7642007-12-19 02:45:37 +00008097 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008098 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008100 if (res == -1)
8101 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 if (outsize<requiredsize)
8103 if (charmapencode_resize(outobj, outpos, requiredsize))
8104 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008105 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008106 outstart[(*outpos)++] = (char)res;
8107 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008108 }
8109
8110 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008111 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008113 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 Py_DECREF(rep);
8115 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008116 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008117 if (PyLong_Check(rep)) {
8118 Py_ssize_t requiredsize = *outpos+1;
8119 if (outsize<requiredsize)
8120 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8121 Py_DECREF(rep);
8122 return enc_EXCEPTION;
8123 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008124 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008126 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 else {
8128 const char *repchars = PyBytes_AS_STRING(rep);
8129 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8130 Py_ssize_t requiredsize = *outpos+repsize;
8131 if (outsize<requiredsize)
8132 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8133 Py_DECREF(rep);
8134 return enc_EXCEPTION;
8135 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008136 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 memcpy(outstart + *outpos, repchars, repsize);
8138 *outpos += repsize;
8139 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008141 Py_DECREF(rep);
8142 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008143}
8144
8145/* handle an error in PyUnicode_EncodeCharmap
8146 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008147static int
8148charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008149 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008150 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008151 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008152 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008153{
8154 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008155 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008156 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008157 enum PyUnicode_Kind kind;
8158 void *data;
8159 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008160 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008161 Py_ssize_t collstartpos = *inpos;
8162 Py_ssize_t collendpos = *inpos+1;
8163 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008164 char *encoding = "charmap";
8165 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008166 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008167 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008168 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008169
Benjamin Petersonbac79492012-01-14 13:34:47 -05008170 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008171 return -1;
8172 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008173 /* find all unencodable characters */
8174 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008175 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008176 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008177 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008178 val = encoding_map_lookup(ch, mapping);
8179 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008180 break;
8181 ++collendpos;
8182 continue;
8183 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008184
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008185 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8186 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008187 if (rep==NULL)
8188 return -1;
8189 else if (rep!=Py_None) {
8190 Py_DECREF(rep);
8191 break;
8192 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008193 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008194 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008195 }
8196 /* cache callback name lookup
8197 * (if not done yet, i.e. it's the first error) */
8198 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008199 if ((errors==NULL) || (!strcmp(errors, "strict")))
8200 *known_errorHandler = 1;
8201 else if (!strcmp(errors, "replace"))
8202 *known_errorHandler = 2;
8203 else if (!strcmp(errors, "ignore"))
8204 *known_errorHandler = 3;
8205 else if (!strcmp(errors, "xmlcharrefreplace"))
8206 *known_errorHandler = 4;
8207 else
8208 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008209 }
8210 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008211 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008212 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008213 return -1;
8214 case 2: /* replace */
8215 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008216 x = charmapencode_output('?', mapping, res, respos);
8217 if (x==enc_EXCEPTION) {
8218 return -1;
8219 }
8220 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008221 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008222 return -1;
8223 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008224 }
8225 /* fall through */
8226 case 3: /* ignore */
8227 *inpos = collendpos;
8228 break;
8229 case 4: /* xmlcharrefreplace */
8230 /* generate replacement (temporarily (mis)uses p) */
8231 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008232 char buffer[2+29+1+1];
8233 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008234 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 for (cp = buffer; *cp; ++cp) {
8236 x = charmapencode_output(*cp, mapping, res, respos);
8237 if (x==enc_EXCEPTION)
8238 return -1;
8239 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008240 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 return -1;
8242 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008243 }
8244 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008245 *inpos = collendpos;
8246 break;
8247 default:
8248 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008249 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008251 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008252 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008253 if (PyBytes_Check(repunicode)) {
8254 /* Directly copy bytes result to output. */
8255 Py_ssize_t outsize = PyBytes_Size(*res);
8256 Py_ssize_t requiredsize;
8257 repsize = PyBytes_Size(repunicode);
8258 requiredsize = *respos + repsize;
8259 if (requiredsize > outsize)
8260 /* Make room for all additional bytes. */
8261 if (charmapencode_resize(res, respos, requiredsize)) {
8262 Py_DECREF(repunicode);
8263 return -1;
8264 }
8265 memcpy(PyBytes_AsString(*res) + *respos,
8266 PyBytes_AsString(repunicode), repsize);
8267 *respos += repsize;
8268 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008269 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008270 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008271 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008272 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008273 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008274 Py_DECREF(repunicode);
8275 return -1;
8276 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008277 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008278 data = PyUnicode_DATA(repunicode);
8279 kind = PyUnicode_KIND(repunicode);
8280 for (index = 0; index < repsize; index++) {
8281 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8282 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008284 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008285 return -1;
8286 }
8287 else if (x==enc_FAILED) {
8288 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008289 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008290 return -1;
8291 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008292 }
8293 *inpos = newpos;
8294 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008295 }
8296 return 0;
8297}
8298
Alexander Belopolsky40018472011-02-26 01:02:56 +00008299PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008300_PyUnicode_EncodeCharmap(PyObject *unicode,
8301 PyObject *mapping,
8302 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008304 /* output object */
8305 PyObject *res = NULL;
8306 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008307 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008308 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008309 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008310 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008311 PyObject *errorHandler = NULL;
8312 PyObject *exc = NULL;
8313 /* the following variable is used for caching string comparisons
8314 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8315 * 3=ignore, 4=xmlcharrefreplace */
8316 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008317 void *data;
8318 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319
Benjamin Petersonbac79492012-01-14 13:34:47 -05008320 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008321 return NULL;
8322 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008323 data = PyUnicode_DATA(unicode);
8324 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008325
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326 /* Default to Latin-1 */
8327 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008328 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008329
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008330 /* allocate enough for a simple encoding without
8331 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008332 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008333 if (res == NULL)
8334 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008335 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008338 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008339 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008341 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008342 if (x==enc_EXCEPTION) /* error */
8343 goto onError;
8344 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008345 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 &exc,
8347 &known_errorHandler, &errorHandler, errors,
8348 &res, &respos)) {
8349 goto onError;
8350 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008351 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008352 else
8353 /* done with this character => adjust input position */
8354 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008355 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008357 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008358 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008359 if (_PyBytes_Resize(&res, respos) < 0)
8360 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008361
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008362 Py_XDECREF(exc);
8363 Py_XDECREF(errorHandler);
8364 return res;
8365
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367 Py_XDECREF(res);
8368 Py_XDECREF(exc);
8369 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370 return NULL;
8371}
8372
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008373/* Deprecated */
8374PyObject *
8375PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8376 Py_ssize_t size,
8377 PyObject *mapping,
8378 const char *errors)
8379{
8380 PyObject *result;
8381 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8382 if (unicode == NULL)
8383 return NULL;
8384 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8385 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008386 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008387}
8388
Alexander Belopolsky40018472011-02-26 01:02:56 +00008389PyObject *
8390PyUnicode_AsCharmapString(PyObject *unicode,
8391 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008392{
8393 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 PyErr_BadArgument();
8395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008396 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008397 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008398}
8399
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008400/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008401static void
8402make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008403 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008404 Py_ssize_t startpos, Py_ssize_t endpos,
8405 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008406{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008407 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008408 *exceptionObject = _PyUnicodeTranslateError_Create(
8409 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008410 }
8411 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8413 goto onError;
8414 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8415 goto onError;
8416 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8417 goto onError;
8418 return;
8419 onError:
8420 Py_DECREF(*exceptionObject);
8421 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422 }
8423}
8424
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008425/* error handling callback helper:
8426 build arguments, call the callback and check the arguments,
8427 put the result into newpos and return the replacement string, which
8428 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008429static PyObject *
8430unicode_translate_call_errorhandler(const char *errors,
8431 PyObject **errorHandler,
8432 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008433 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008434 Py_ssize_t startpos, Py_ssize_t endpos,
8435 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008436{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008437 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008438
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008439 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008440 PyObject *restuple;
8441 PyObject *resunicode;
8442
8443 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008445 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008447 }
8448
8449 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008450 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008451 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008453
8454 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008456 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008458 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008459 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008460 Py_DECREF(restuple);
8461 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008462 }
8463 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 &resunicode, &i_newpos)) {
8465 Py_DECREF(restuple);
8466 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008467 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008468 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008469 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008470 else
8471 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8474 Py_DECREF(restuple);
8475 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008476 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008477 Py_INCREF(resunicode);
8478 Py_DECREF(restuple);
8479 return resunicode;
8480}
8481
8482/* Lookup the character ch in the mapping and put the result in result,
8483 which must be decrefed by the caller.
8484 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008485static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008486charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008487{
Christian Heimes217cfd12007-12-02 14:31:20 +00008488 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008489 PyObject *x;
8490
8491 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008493 x = PyObject_GetItem(mapping, w);
8494 Py_DECREF(w);
8495 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8497 /* No mapping found means: use 1:1 mapping. */
8498 PyErr_Clear();
8499 *result = NULL;
8500 return 0;
8501 } else
8502 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008503 }
8504 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 *result = x;
8506 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008507 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008508 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 long value = PyLong_AS_LONG(x);
8510 long max = PyUnicode_GetMax();
8511 if (value < 0 || value > max) {
8512 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008513 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 Py_DECREF(x);
8515 return -1;
8516 }
8517 *result = x;
8518 return 0;
8519 }
8520 else if (PyUnicode_Check(x)) {
8521 *result = x;
8522 return 0;
8523 }
8524 else {
8525 /* wrong return value */
8526 PyErr_SetString(PyExc_TypeError,
8527 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008528 Py_DECREF(x);
8529 return -1;
8530 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008531}
8532/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008533 if not reallocate and adjust various state variables.
8534 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008535static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008536charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008537 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008538{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008539 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008540 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008541 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008542 /* exponentially overallocate to minimize reallocations */
8543 if (requiredsize < 2 * oldsize)
8544 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008545 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8546 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008547 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008548 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008549 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008550 }
8551 return 0;
8552}
8553/* lookup the character, put the result in the output string and adjust
8554 various state variables. Return a new reference to the object that
8555 was put in the output buffer in *result, or Py_None, if the mapping was
8556 undefined (in which case no character was written).
8557 The called must decref result.
8558 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008559static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008560charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8561 PyObject *mapping, Py_UCS4 **output,
8562 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008563 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008564{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008565 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8566 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008568 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008571 }
8572 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008574 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008576 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008577 }
8578 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008579 Py_ssize_t repsize;
8580 if (PyUnicode_READY(*res) == -1)
8581 return -1;
8582 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 if (repsize==1) {
8584 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008585 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 }
8587 else if (repsize!=0) {
8588 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589 Py_ssize_t requiredsize = *opos +
8590 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008592 Py_ssize_t i;
8593 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008595 for(i = 0; i < repsize; i++)
8596 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008598 }
8599 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008600 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008601 return 0;
8602}
8603
Alexander Belopolsky40018472011-02-26 01:02:56 +00008604PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008605_PyUnicode_TranslateCharmap(PyObject *input,
8606 PyObject *mapping,
8607 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008609 /* input object */
8610 char *idata;
8611 Py_ssize_t size, i;
8612 int kind;
8613 /* output buffer */
8614 Py_UCS4 *output = NULL;
8615 Py_ssize_t osize;
8616 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008617 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008618 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008619 char *reason = "character maps to <undefined>";
8620 PyObject *errorHandler = NULL;
8621 PyObject *exc = NULL;
8622 /* the following variable is used for caching string comparisons
8623 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8624 * 3=ignore, 4=xmlcharrefreplace */
8625 int known_errorHandler = -1;
8626
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 PyErr_BadArgument();
8629 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632 if (PyUnicode_READY(input) == -1)
8633 return NULL;
8634 idata = (char*)PyUnicode_DATA(input);
8635 kind = PyUnicode_KIND(input);
8636 size = PyUnicode_GET_LENGTH(input);
8637 i = 0;
8638
8639 if (size == 0) {
8640 Py_INCREF(input);
8641 return input;
8642 }
8643
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008644 /* allocate enough for a simple 1:1 translation without
8645 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008646 osize = size;
8647 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8648 opos = 0;
8649 if (output == NULL) {
8650 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008652 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008654 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 /* try to encode it */
8656 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008657 if (charmaptranslate_output(input, i, mapping,
8658 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 Py_XDECREF(x);
8660 goto onError;
8661 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008662 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 else { /* untranslatable character */
8666 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8667 Py_ssize_t repsize;
8668 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008669 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008670 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 Py_ssize_t collstart = i;
8672 Py_ssize_t collend = i+1;
8673 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676 while (collend < size) {
8677 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 goto onError;
8679 Py_XDECREF(x);
8680 if (x!=Py_None)
8681 break;
8682 ++collend;
8683 }
8684 /* cache callback name lookup
8685 * (if not done yet, i.e. it's the first error) */
8686 if (known_errorHandler==-1) {
8687 if ((errors==NULL) || (!strcmp(errors, "strict")))
8688 known_errorHandler = 1;
8689 else if (!strcmp(errors, "replace"))
8690 known_errorHandler = 2;
8691 else if (!strcmp(errors, "ignore"))
8692 known_errorHandler = 3;
8693 else if (!strcmp(errors, "xmlcharrefreplace"))
8694 known_errorHandler = 4;
8695 else
8696 known_errorHandler = 0;
8697 }
8698 switch (known_errorHandler) {
8699 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008700 make_translate_exception(&exc,
8701 input, collstart, collend, reason);
8702 if (exc != NULL)
8703 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008704 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 case 2: /* replace */
8706 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707 for (coll = collstart; coll<collend; coll++)
8708 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008709 /* fall through */
8710 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008711 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 break;
8713 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714 /* generate replacement (temporarily (mis)uses i) */
8715 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 char buffer[2+29+1+1];
8717 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008718 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8719 if (charmaptranslate_makespace(&output, &osize,
8720 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 goto onError;
8722 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008723 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008725 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 break;
8727 default:
8728 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008729 reason, input, &exc,
8730 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008731 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008732 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008733 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008734 Py_DECREF(repunicode);
8735 goto onError;
8736 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008737 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008738 repsize = PyUnicode_GET_LENGTH(repunicode);
8739 if (charmaptranslate_makespace(&output, &osize,
8740 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008741 Py_DECREF(repunicode);
8742 goto onError;
8743 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008744 for (uni2 = 0; repsize-->0; ++uni2)
8745 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8746 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008747 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008748 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008749 }
8750 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008751 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8752 if (!res)
8753 goto onError;
8754 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008755 Py_XDECREF(exc);
8756 Py_XDECREF(errorHandler);
8757 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008758
Benjamin Peterson29060642009-01-31 22:14:21 +00008759 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008760 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008761 Py_XDECREF(exc);
8762 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008763 return NULL;
8764}
8765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008766/* Deprecated. Use PyUnicode_Translate instead. */
8767PyObject *
8768PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8769 Py_ssize_t size,
8770 PyObject *mapping,
8771 const char *errors)
8772{
Christian Heimes5f520f42012-09-11 14:03:25 +02008773 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008774 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8775 if (!unicode)
8776 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008777 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8778 Py_DECREF(unicode);
8779 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780}
8781
Alexander Belopolsky40018472011-02-26 01:02:56 +00008782PyObject *
8783PyUnicode_Translate(PyObject *str,
8784 PyObject *mapping,
8785 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786{
8787 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008788
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789 str = PyUnicode_FromObject(str);
8790 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008791 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008792 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793 Py_DECREF(str);
8794 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008795}
Tim Petersced69f82003-09-16 20:30:58 +00008796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008798fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008799{
8800 /* No need to call PyUnicode_READY(self) because this function is only
8801 called as a callback from fixup() which does it already. */
8802 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8803 const int kind = PyUnicode_KIND(self);
8804 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008805 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008806 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008807 Py_ssize_t i;
8808
8809 for (i = 0; i < len; ++i) {
8810 ch = PyUnicode_READ(kind, data, i);
8811 fixed = 0;
8812 if (ch > 127) {
8813 if (Py_UNICODE_ISSPACE(ch))
8814 fixed = ' ';
8815 else {
8816 const int decimal = Py_UNICODE_TODECIMAL(ch);
8817 if (decimal >= 0)
8818 fixed = '0' + decimal;
8819 }
8820 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008821 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008822 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008823 PyUnicode_WRITE(kind, data, i, fixed);
8824 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008825 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008826 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008827 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008828 }
8829
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008830 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831}
8832
8833PyObject *
8834_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8835{
8836 if (!PyUnicode_Check(unicode)) {
8837 PyErr_BadInternalCall();
8838 return NULL;
8839 }
8840 if (PyUnicode_READY(unicode) == -1)
8841 return NULL;
8842 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8843 /* If the string is already ASCII, just return the same string */
8844 Py_INCREF(unicode);
8845 return unicode;
8846 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008847 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008848}
8849
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008850PyObject *
8851PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8852 Py_ssize_t length)
8853{
Victor Stinnerf0124502011-11-21 23:12:56 +01008854 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008855 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008856 Py_UCS4 maxchar;
8857 enum PyUnicode_Kind kind;
8858 void *data;
8859
Victor Stinner99d7ad02012-02-22 13:37:39 +01008860 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008861 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008862 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008863 if (ch > 127) {
8864 int decimal = Py_UNICODE_TODECIMAL(ch);
8865 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008866 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008867 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008868 }
8869 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008870
8871 /* Copy to a new string */
8872 decimal = PyUnicode_New(length, maxchar);
8873 if (decimal == NULL)
8874 return decimal;
8875 kind = PyUnicode_KIND(decimal);
8876 data = PyUnicode_DATA(decimal);
8877 /* Iterate over code points */
8878 for (i = 0; i < length; i++) {
8879 Py_UNICODE ch = s[i];
8880 if (ch > 127) {
8881 int decimal = Py_UNICODE_TODECIMAL(ch);
8882 if (decimal >= 0)
8883 ch = '0' + decimal;
8884 }
8885 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008886 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008887 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008888}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008889/* --- Decimal Encoder ---------------------------------------------------- */
8890
Alexander Belopolsky40018472011-02-26 01:02:56 +00008891int
8892PyUnicode_EncodeDecimal(Py_UNICODE *s,
8893 Py_ssize_t length,
8894 char *output,
8895 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008896{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008897 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008898 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008899 enum PyUnicode_Kind kind;
8900 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008901
8902 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008903 PyErr_BadArgument();
8904 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008905 }
8906
Victor Stinner42bf7752011-11-21 22:52:58 +01008907 unicode = PyUnicode_FromUnicode(s, length);
8908 if (unicode == NULL)
8909 return -1;
8910
Benjamin Petersonbac79492012-01-14 13:34:47 -05008911 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008912 Py_DECREF(unicode);
8913 return -1;
8914 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008915 kind = PyUnicode_KIND(unicode);
8916 data = PyUnicode_DATA(unicode);
8917
Victor Stinnerb84d7232011-11-22 01:50:07 +01008918 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008919 PyObject *exc;
8920 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008921 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008922 Py_ssize_t startpos;
8923
8924 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008925
Benjamin Peterson29060642009-01-31 22:14:21 +00008926 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008927 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008928 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008929 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008930 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008931 decimal = Py_UNICODE_TODECIMAL(ch);
8932 if (decimal >= 0) {
8933 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008934 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008935 continue;
8936 }
8937 if (0 < ch && ch < 256) {
8938 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008939 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008940 continue;
8941 }
Victor Stinner6345be92011-11-25 20:09:01 +01008942
Victor Stinner42bf7752011-11-21 22:52:58 +01008943 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008944 exc = NULL;
8945 raise_encode_exception(&exc, "decimal", unicode,
8946 startpos, startpos+1,
8947 "invalid decimal Unicode string");
8948 Py_XDECREF(exc);
8949 Py_DECREF(unicode);
8950 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008951 }
8952 /* 0-terminate the output string */
8953 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008954 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008955 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008956}
8957
Guido van Rossumd57fd912000-03-10 22:53:23 +00008958/* --- Helpers ------------------------------------------------------------ */
8959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008961any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962 Py_ssize_t start,
8963 Py_ssize_t end)
8964{
8965 int kind1, kind2, kind;
8966 void *buf1, *buf2;
8967 Py_ssize_t len1, len2, result;
8968
8969 kind1 = PyUnicode_KIND(s1);
8970 kind2 = PyUnicode_KIND(s2);
8971 kind = kind1 > kind2 ? kind1 : kind2;
8972 buf1 = PyUnicode_DATA(s1);
8973 buf2 = PyUnicode_DATA(s2);
8974 if (kind1 != kind)
8975 buf1 = _PyUnicode_AsKind(s1, kind);
8976 if (!buf1)
8977 return -2;
8978 if (kind2 != kind)
8979 buf2 = _PyUnicode_AsKind(s2, kind);
8980 if (!buf2) {
8981 if (kind1 != kind) PyMem_Free(buf1);
8982 return -2;
8983 }
8984 len1 = PyUnicode_GET_LENGTH(s1);
8985 len2 = PyUnicode_GET_LENGTH(s2);
8986
Victor Stinner794d5672011-10-10 03:21:36 +02008987 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008988 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008989 case PyUnicode_1BYTE_KIND:
8990 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8991 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8992 else
8993 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8994 break;
8995 case PyUnicode_2BYTE_KIND:
8996 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8997 break;
8998 case PyUnicode_4BYTE_KIND:
8999 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9000 break;
9001 default:
9002 assert(0); result = -2;
9003 }
9004 }
9005 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009006 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009007 case PyUnicode_1BYTE_KIND:
9008 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9009 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9010 else
9011 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9012 break;
9013 case PyUnicode_2BYTE_KIND:
9014 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9015 break;
9016 case PyUnicode_4BYTE_KIND:
9017 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9018 break;
9019 default:
9020 assert(0); result = -2;
9021 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009022 }
9023
9024 if (kind1 != kind)
9025 PyMem_Free(buf1);
9026 if (kind2 != kind)
9027 PyMem_Free(buf2);
9028
9029 return result;
9030}
9031
9032Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009033_PyUnicode_InsertThousandsGrouping(
9034 PyObject *unicode, Py_ssize_t index,
9035 Py_ssize_t n_buffer,
9036 void *digits, Py_ssize_t n_digits,
9037 Py_ssize_t min_width,
9038 const char *grouping, PyObject *thousands_sep,
9039 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009040{
Victor Stinner41a863c2012-02-24 00:37:51 +01009041 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009042 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009043 Py_ssize_t thousands_sep_len;
9044 Py_ssize_t len;
9045
9046 if (unicode != NULL) {
9047 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009048 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009049 }
9050 else {
9051 kind = PyUnicode_1BYTE_KIND;
9052 data = NULL;
9053 }
9054 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9055 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9056 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9057 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009058 if (thousands_sep_kind < kind) {
9059 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9060 if (!thousands_sep_data)
9061 return -1;
9062 }
9063 else {
9064 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9065 if (!data)
9066 return -1;
9067 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009068 }
9069
Benjamin Petersonead6b532011-12-20 17:23:42 -06009070 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009071 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009072 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009073 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009074 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009075 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009076 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009077 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009078 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009079 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009080 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009081 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009082 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009083 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009084 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009085 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009086 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009087 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009088 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009089 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009090 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009091 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009092 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009093 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009094 break;
9095 default:
9096 assert(0);
9097 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009099 if (unicode != NULL && thousands_sep_kind != kind) {
9100 if (thousands_sep_kind < kind)
9101 PyMem_Free(thousands_sep_data);
9102 else
9103 PyMem_Free(data);
9104 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009105 if (unicode == NULL) {
9106 *maxchar = 127;
9107 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009108 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009109 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009110 }
9111 }
9112 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009113}
9114
9115
/* helper macro to fixup start/end slice values:
   end is clamped to len, negative start/end are taken relative to len,
   and anything still negative becomes 0.
   start and end must be modifiable lvalues; they are updated in place.
   Wrapped in do { } while (0) so the macro behaves as one statement
   (e.g. inside an unbraced if/else); invoke it with a trailing ';'. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009130
Alexander Belopolsky40018472011-02-26 01:02:56 +00009131Py_ssize_t
9132PyUnicode_Count(PyObject *str,
9133 PyObject *substr,
9134 Py_ssize_t start,
9135 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009136{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009137 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009138 PyObject* str_obj;
9139 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009140 int kind1, kind2, kind;
9141 void *buf1 = NULL, *buf2 = NULL;
9142 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009143
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009144 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009145 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009146 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009147 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009148 if (!sub_obj) {
9149 Py_DECREF(str_obj);
9150 return -1;
9151 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009152 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009153 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009154 Py_DECREF(str_obj);
9155 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009156 }
Tim Petersced69f82003-09-16 20:30:58 +00009157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009158 kind1 = PyUnicode_KIND(str_obj);
9159 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009160 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009161 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009162 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009163 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02009164 if (kind2 > kind) {
9165 Py_DECREF(sub_obj);
9166 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009167 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02009168 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01009169 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009170 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009171 if (!buf2)
9172 goto onError;
9173 len1 = PyUnicode_GET_LENGTH(str_obj);
9174 len2 = PyUnicode_GET_LENGTH(sub_obj);
9175
9176 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009177 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009178 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009179 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9180 result = asciilib_count(
9181 ((Py_UCS1*)buf1) + start, end - start,
9182 buf2, len2, PY_SSIZE_T_MAX
9183 );
9184 else
9185 result = ucs1lib_count(
9186 ((Py_UCS1*)buf1) + start, end - start,
9187 buf2, len2, PY_SSIZE_T_MAX
9188 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009189 break;
9190 case PyUnicode_2BYTE_KIND:
9191 result = ucs2lib_count(
9192 ((Py_UCS2*)buf1) + start, end - start,
9193 buf2, len2, PY_SSIZE_T_MAX
9194 );
9195 break;
9196 case PyUnicode_4BYTE_KIND:
9197 result = ucs4lib_count(
9198 ((Py_UCS4*)buf1) + start, end - start,
9199 buf2, len2, PY_SSIZE_T_MAX
9200 );
9201 break;
9202 default:
9203 assert(0); result = 0;
9204 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009205
9206 Py_DECREF(sub_obj);
9207 Py_DECREF(str_obj);
9208
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009209 if (kind2 != kind)
9210 PyMem_Free(buf2);
9211
Guido van Rossumd57fd912000-03-10 22:53:23 +00009212 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009213 onError:
9214 Py_DECREF(sub_obj);
9215 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009216 if (kind2 != kind && buf2)
9217 PyMem_Free(buf2);
9218 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009219}
9220
Alexander Belopolsky40018472011-02-26 01:02:56 +00009221Py_ssize_t
9222PyUnicode_Find(PyObject *str,
9223 PyObject *sub,
9224 Py_ssize_t start,
9225 Py_ssize_t end,
9226 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009227{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009228 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009229
Guido van Rossumd57fd912000-03-10 22:53:23 +00009230 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009231 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009232 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009233 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009234 if (!sub) {
9235 Py_DECREF(str);
9236 return -2;
9237 }
9238 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9239 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009240 Py_DECREF(str);
9241 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009242 }
Tim Petersced69f82003-09-16 20:30:58 +00009243
Victor Stinner794d5672011-10-10 03:21:36 +02009244 result = any_find_slice(direction,
9245 str, sub, start, end
9246 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009247
Guido van Rossumd57fd912000-03-10 22:53:23 +00009248 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009249 Py_DECREF(sub);
9250
Guido van Rossumd57fd912000-03-10 22:53:23 +00009251 return result;
9252}
9253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254Py_ssize_t
9255PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9256 Py_ssize_t start, Py_ssize_t end,
9257 int direction)
9258{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009259 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009260 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261 if (PyUnicode_READY(str) == -1)
9262 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009263 if (start < 0 || end < 0) {
9264 PyErr_SetString(PyExc_IndexError, "string index out of range");
9265 return -2;
9266 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009267 if (end > PyUnicode_GET_LENGTH(str))
9268 end = PyUnicode_GET_LENGTH(str);
9269 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009270 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9271 kind, end-start, ch, direction);
9272 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009274 else
9275 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009276}
9277
Alexander Belopolsky40018472011-02-26 01:02:56 +00009278static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009279tailmatch(PyObject *self,
9280 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009281 Py_ssize_t start,
9282 Py_ssize_t end,
9283 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285 int kind_self;
9286 int kind_sub;
9287 void *data_self;
9288 void *data_sub;
9289 Py_ssize_t offset;
9290 Py_ssize_t i;
9291 Py_ssize_t end_sub;
9292
9293 if (PyUnicode_READY(self) == -1 ||
9294 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009295 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009296
9297 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009298 return 1;
9299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009300 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9301 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009302 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009303 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305 kind_self = PyUnicode_KIND(self);
9306 data_self = PyUnicode_DATA(self);
9307 kind_sub = PyUnicode_KIND(substring);
9308 data_sub = PyUnicode_DATA(substring);
9309 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9310
9311 if (direction > 0)
9312 offset = end;
9313 else
9314 offset = start;
9315
9316 if (PyUnicode_READ(kind_self, data_self, offset) ==
9317 PyUnicode_READ(kind_sub, data_sub, 0) &&
9318 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9319 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9320 /* If both are of the same kind, memcmp is sufficient */
9321 if (kind_self == kind_sub) {
9322 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009323 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009324 data_sub,
9325 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009326 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009327 }
9328 /* otherwise we have to compare each character by first accesing it */
9329 else {
9330 /* We do not need to compare 0 and len(substring)-1 because
9331 the if statement above ensured already that they are equal
9332 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009333 for (i = 1; i < end_sub; ++i) {
9334 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9335 PyUnicode_READ(kind_sub, data_sub, i))
9336 return 0;
9337 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009338 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009339 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009340 }
9341
9342 return 0;
9343}
9344
Alexander Belopolsky40018472011-02-26 01:02:56 +00009345Py_ssize_t
9346PyUnicode_Tailmatch(PyObject *str,
9347 PyObject *substr,
9348 Py_ssize_t start,
9349 Py_ssize_t end,
9350 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009351{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009352 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009353
Guido van Rossumd57fd912000-03-10 22:53:23 +00009354 str = PyUnicode_FromObject(str);
9355 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009356 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009357 substr = PyUnicode_FromObject(substr);
9358 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009359 Py_DECREF(str);
9360 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361 }
Tim Petersced69f82003-09-16 20:30:58 +00009362
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009363 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009364 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365 Py_DECREF(str);
9366 Py_DECREF(substr);
9367 return result;
9368}
9369
Guido van Rossumd57fd912000-03-10 22:53:23 +00009370/* Apply fixfct filter to the Unicode object self and return a
9371 reference to the modified object */
9372
Alexander Belopolsky40018472011-02-26 01:02:56 +00009373static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009374fixup(PyObject *self,
9375 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009376{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377 PyObject *u;
9378 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009379 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009380
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009381 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009382 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009383 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009384 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009386 /* fix functions return the new maximum character in a string,
9387 if the kind of the resulting unicode object does not change,
9388 everything is fine. Otherwise we need to change the string kind
9389 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009390 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009391
9392 if (maxchar_new == 0) {
9393 /* no changes */;
9394 if (PyUnicode_CheckExact(self)) {
9395 Py_DECREF(u);
9396 Py_INCREF(self);
9397 return self;
9398 }
9399 else
9400 return u;
9401 }
9402
Victor Stinnere6abb482012-05-02 01:15:40 +02009403 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404
Victor Stinnereaab6042011-12-11 22:22:39 +01009405 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009407
9408 /* In case the maximum character changed, we need to
9409 convert the string to the new category. */
9410 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9411 if (v == NULL) {
9412 Py_DECREF(u);
9413 return NULL;
9414 }
9415 if (maxchar_new > maxchar_old) {
9416 /* If the maxchar increased so that the kind changed, not all
9417 characters are representable anymore and we need to fix the
9418 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009419 _PyUnicode_FastCopyCharacters(v, 0,
9420 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009421 maxchar_old = fixfct(v);
9422 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 }
9424 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009425 _PyUnicode_FastCopyCharacters(v, 0,
9426 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009428 Py_DECREF(u);
9429 assert(_PyUnicode_CheckConsistency(v, 1));
9430 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009431}
9432
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009433static PyObject *
9434ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009436 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9437 char *resdata, *data = PyUnicode_DATA(self);
9438 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009439
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009440 res = PyUnicode_New(len, 127);
9441 if (res == NULL)
9442 return NULL;
9443 resdata = PyUnicode_DATA(res);
9444 if (lower)
9445 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009446 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009447 _Py_bytes_upper(resdata, data, len);
9448 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009449}
9450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009452handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009453{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009454 Py_ssize_t j;
9455 int final_sigma;
9456 Py_UCS4 c;
9457 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009458
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009459 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9460
9461 where ! is a negation and \p{xxx} is a character with property xxx.
9462 */
9463 for (j = i - 1; j >= 0; j--) {
9464 c = PyUnicode_READ(kind, data, j);
9465 if (!_PyUnicode_IsCaseIgnorable(c))
9466 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009467 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009468 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9469 if (final_sigma) {
9470 for (j = i + 1; j < length; j++) {
9471 c = PyUnicode_READ(kind, data, j);
9472 if (!_PyUnicode_IsCaseIgnorable(c))
9473 break;
9474 }
9475 final_sigma = j == length || !_PyUnicode_IsCased(c);
9476 }
9477 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478}
9479
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009480static int
9481lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9482 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009483{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009484 /* Obscure special case. */
9485 if (c == 0x3A3) {
9486 mapped[0] = handle_capital_sigma(kind, data, length, i);
9487 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009489 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009490}
9491
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009492static Py_ssize_t
9493do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009494{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009495 Py_ssize_t i, k = 0;
9496 int n_res, j;
9497 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009498
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009499 c = PyUnicode_READ(kind, data, 0);
9500 n_res = _PyUnicode_ToUpperFull(c, mapped);
9501 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009502 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009503 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009505 for (i = 1; i < length; i++) {
9506 c = PyUnicode_READ(kind, data, i);
9507 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9508 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009509 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009510 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009511 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009512 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009513 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514}
9515
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009516static Py_ssize_t
9517do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9518 Py_ssize_t i, k = 0;
9519
9520 for (i = 0; i < length; i++) {
9521 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9522 int n_res, j;
9523 if (Py_UNICODE_ISUPPER(c)) {
9524 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9525 }
9526 else if (Py_UNICODE_ISLOWER(c)) {
9527 n_res = _PyUnicode_ToUpperFull(c, mapped);
9528 }
9529 else {
9530 n_res = 1;
9531 mapped[0] = c;
9532 }
9533 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009534 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009535 res[k++] = mapped[j];
9536 }
9537 }
9538 return k;
9539}
9540
9541static Py_ssize_t
9542do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9543 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009545 Py_ssize_t i, k = 0;
9546
9547 for (i = 0; i < length; i++) {
9548 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9549 int n_res, j;
9550 if (lower)
9551 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9552 else
9553 n_res = _PyUnicode_ToUpperFull(c, mapped);
9554 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009555 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009556 res[k++] = mapped[j];
9557 }
9558 }
9559 return k;
9560}
9561
9562static Py_ssize_t
9563do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9564{
9565 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9566}
9567
9568static Py_ssize_t
9569do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9570{
9571 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9572}
9573
Benjamin Petersone51757f2012-01-12 21:10:29 -05009574static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009575do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9576{
9577 Py_ssize_t i, k = 0;
9578
9579 for (i = 0; i < length; i++) {
9580 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9581 Py_UCS4 mapped[3];
9582 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9583 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009584 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009585 res[k++] = mapped[j];
9586 }
9587 }
9588 return k;
9589}
9590
9591static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009592do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9593{
9594 Py_ssize_t i, k = 0;
9595 int previous_is_cased;
9596
9597 previous_is_cased = 0;
9598 for (i = 0; i < length; i++) {
9599 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9600 Py_UCS4 mapped[3];
9601 int n_res, j;
9602
9603 if (previous_is_cased)
9604 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9605 else
9606 n_res = _PyUnicode_ToTitleFull(c, mapped);
9607
9608 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009609 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009610 res[k++] = mapped[j];
9611 }
9612
9613 previous_is_cased = _PyUnicode_IsCased(c);
9614 }
9615 return k;
9616}
9617
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009618static PyObject *
9619case_operation(PyObject *self,
9620 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9621{
9622 PyObject *res = NULL;
9623 Py_ssize_t length, newlength = 0;
9624 int kind, outkind;
9625 void *data, *outdata;
9626 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9627
Benjamin Petersoneea48462012-01-16 14:28:50 -05009628 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009629
9630 kind = PyUnicode_KIND(self);
9631 data = PyUnicode_DATA(self);
9632 length = PyUnicode_GET_LENGTH(self);
9633 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9634 if (tmp == NULL)
9635 return PyErr_NoMemory();
9636 newlength = perform(kind, data, length, tmp, &maxchar);
9637 res = PyUnicode_New(newlength, maxchar);
9638 if (res == NULL)
9639 goto leave;
9640 tmpend = tmp + newlength;
9641 outdata = PyUnicode_DATA(res);
9642 outkind = PyUnicode_KIND(res);
9643 switch (outkind) {
9644 case PyUnicode_1BYTE_KIND:
9645 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9646 break;
9647 case PyUnicode_2BYTE_KIND:
9648 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9649 break;
9650 case PyUnicode_4BYTE_KIND:
9651 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9652 break;
9653 default:
9654 assert(0);
9655 break;
9656 }
9657 leave:
9658 PyMem_FREE(tmp);
9659 return res;
9660}
9661
Tim Peters8ce9f162004-08-27 01:49:32 +00009662PyObject *
9663PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009664{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009665 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009666 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009667 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009668 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009669 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9670 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009671 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009672 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009673 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009674 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009675 int use_memcpy;
9676 unsigned char *res_data = NULL, *sep_data = NULL;
9677 PyObject *last_obj;
9678 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009679
Tim Peters05eba1f2004-08-27 21:32:02 +00009680 fseq = PySequence_Fast(seq, "");
9681 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009682 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009683 }
9684
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009685 /* NOTE: the following code can't call back into Python code,
9686 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009687 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009688
Tim Peters05eba1f2004-08-27 21:32:02 +00009689 seqlen = PySequence_Fast_GET_SIZE(fseq);
9690 /* If empty sequence, return u"". */
9691 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009692 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009693 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009694 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009695
Tim Peters05eba1f2004-08-27 21:32:02 +00009696 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009697 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009698 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009699 if (seqlen == 1) {
9700 if (PyUnicode_CheckExact(items[0])) {
9701 res = items[0];
9702 Py_INCREF(res);
9703 Py_DECREF(fseq);
9704 return res;
9705 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009706 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009707 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009708 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009709 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009710 /* Set up sep and seplen */
9711 if (separator == NULL) {
9712 /* fall back to a blank space separator */
9713 sep = PyUnicode_FromOrdinal(' ');
9714 if (!sep)
9715 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009716 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009717 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009718 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009719 else {
9720 if (!PyUnicode_Check(separator)) {
9721 PyErr_Format(PyExc_TypeError,
9722 "separator: expected str instance,"
9723 " %.80s found",
9724 Py_TYPE(separator)->tp_name);
9725 goto onError;
9726 }
9727 if (PyUnicode_READY(separator))
9728 goto onError;
9729 sep = separator;
9730 seplen = PyUnicode_GET_LENGTH(separator);
9731 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9732 /* inc refcount to keep this code path symmetric with the
9733 above case of a blank separator */
9734 Py_INCREF(sep);
9735 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009736 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009737 }
9738
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009739 /* There are at least two things to join, or else we have a subclass
9740 * of str in the sequence.
9741 * Do a pre-pass to figure out the total amount of space we'll
9742 * need (sz), and see whether all argument are strings.
9743 */
9744 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009745#ifdef Py_DEBUG
9746 use_memcpy = 0;
9747#else
9748 use_memcpy = 1;
9749#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009750 for (i = 0; i < seqlen; i++) {
9751 const Py_ssize_t old_sz = sz;
9752 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009753 if (!PyUnicode_Check(item)) {
9754 PyErr_Format(PyExc_TypeError,
9755 "sequence item %zd: expected str instance,"
9756 " %.80s found",
9757 i, Py_TYPE(item)->tp_name);
9758 goto onError;
9759 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009760 if (PyUnicode_READY(item) == -1)
9761 goto onError;
9762 sz += PyUnicode_GET_LENGTH(item);
9763 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009764 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009765 if (i != 0)
9766 sz += seplen;
9767 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9768 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009769 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009770 goto onError;
9771 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009772 if (use_memcpy && last_obj != NULL) {
9773 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9774 use_memcpy = 0;
9775 }
9776 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009777 }
Tim Petersced69f82003-09-16 20:30:58 +00009778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009779 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009780 if (res == NULL)
9781 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009782
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009783 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009784#ifdef Py_DEBUG
9785 use_memcpy = 0;
9786#else
9787 if (use_memcpy) {
9788 res_data = PyUnicode_1BYTE_DATA(res);
9789 kind = PyUnicode_KIND(res);
9790 if (seplen != 0)
9791 sep_data = PyUnicode_1BYTE_DATA(sep);
9792 }
9793#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009794 if (use_memcpy) {
9795 for (i = 0; i < seqlen; ++i) {
9796 Py_ssize_t itemlen;
9797 item = items[i];
9798
9799 /* Copy item, and maybe the separator. */
9800 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009801 Py_MEMCPY(res_data,
9802 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009803 kind * seplen);
9804 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009805 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009806
9807 itemlen = PyUnicode_GET_LENGTH(item);
9808 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009809 Py_MEMCPY(res_data,
9810 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009811 kind * itemlen);
9812 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009813 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009814 }
9815 assert(res_data == PyUnicode_1BYTE_DATA(res)
9816 + kind * PyUnicode_GET_LENGTH(res));
9817 }
9818 else {
9819 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9820 Py_ssize_t itemlen;
9821 item = items[i];
9822
9823 /* Copy item, and maybe the separator. */
9824 if (i && seplen != 0) {
9825 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9826 res_offset += seplen;
9827 }
9828
9829 itemlen = PyUnicode_GET_LENGTH(item);
9830 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009831 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009832 res_offset += itemlen;
9833 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009834 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009835 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009836 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009837
Tim Peters05eba1f2004-08-27 21:32:02 +00009838 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009840 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009841 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009842
Benjamin Peterson29060642009-01-31 22:14:21 +00009843 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009844 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009845 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009846 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009847 return NULL;
9848}
9849
/* Store `value` into `length` consecutive characters of `data`,
   beginning at character index `start`.  `kind` selects the character
   width and must be one of the 1/2/4 byte kinds.  Every argument is
   parenthesized in the expansion so that compound expressions can be
   passed safely; note that `value` and `length` are evaluated more than
   once. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9873
Victor Stinnerd3f08822012-05-29 12:57:52 +02009874void
9875_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9876 Py_UCS4 fill_char)
9877{
9878 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9879 const void *data = PyUnicode_DATA(unicode);
9880 assert(PyUnicode_IS_READY(unicode));
9881 assert(unicode_modifiable(unicode));
9882 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9883 assert(start >= 0);
9884 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9885 FILL(kind, data, fill_char, start, length);
9886}
9887
Victor Stinner3fe55312012-01-04 00:33:50 +01009888Py_ssize_t
9889PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9890 Py_UCS4 fill_char)
9891{
9892 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009893
9894 if (!PyUnicode_Check(unicode)) {
9895 PyErr_BadInternalCall();
9896 return -1;
9897 }
9898 if (PyUnicode_READY(unicode) == -1)
9899 return -1;
9900 if (unicode_check_modifiable(unicode))
9901 return -1;
9902
Victor Stinnerd3f08822012-05-29 12:57:52 +02009903 if (start < 0) {
9904 PyErr_SetString(PyExc_IndexError, "string index out of range");
9905 return -1;
9906 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009907 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9908 PyErr_SetString(PyExc_ValueError,
9909 "fill character is bigger than "
9910 "the string maximum character");
9911 return -1;
9912 }
9913
9914 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9915 length = Py_MIN(maxlen, length);
9916 if (length <= 0)
9917 return 0;
9918
Victor Stinnerd3f08822012-05-29 12:57:52 +02009919 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009920 return length;
9921}
9922
Victor Stinner9310abb2011-10-05 00:59:23 +02009923static PyObject *
9924pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009925 Py_ssize_t left,
9926 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009927 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009928{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009929 PyObject *u;
9930 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009931 int kind;
9932 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009933
9934 if (left < 0)
9935 left = 0;
9936 if (right < 0)
9937 right = 0;
9938
Victor Stinnerc4b49542011-12-11 22:44:26 +01009939 if (left == 0 && right == 0)
9940 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009942 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9943 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009944 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9945 return NULL;
9946 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009947 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009948 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009949 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009950 if (!u)
9951 return NULL;
9952
9953 kind = PyUnicode_KIND(u);
9954 data = PyUnicode_DATA(u);
9955 if (left)
9956 FILL(kind, data, fill, 0, left);
9957 if (right)
9958 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009959 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009960 assert(_PyUnicode_CheckConsistency(u, 1));
9961 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009962}
9963
Alexander Belopolsky40018472011-02-26 01:02:56 +00009964PyObject *
9965PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009966{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009967 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009968
9969 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009970 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009971 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009972 if (PyUnicode_READY(string) == -1) {
9973 Py_DECREF(string);
9974 return NULL;
9975 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009976
Benjamin Petersonead6b532011-12-20 17:23:42 -06009977 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009978 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009979 if (PyUnicode_IS_ASCII(string))
9980 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009981 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009982 PyUnicode_GET_LENGTH(string), keepends);
9983 else
9984 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009985 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009986 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009987 break;
9988 case PyUnicode_2BYTE_KIND:
9989 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009990 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009991 PyUnicode_GET_LENGTH(string), keepends);
9992 break;
9993 case PyUnicode_4BYTE_KIND:
9994 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009995 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009996 PyUnicode_GET_LENGTH(string), keepends);
9997 break;
9998 default:
9999 assert(0);
10000 list = 0;
10001 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010002 Py_DECREF(string);
10003 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010004}
10005
Alexander Belopolsky40018472011-02-26 01:02:56 +000010006static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010007split(PyObject *self,
10008 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010009 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010010{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010011 int kind1, kind2, kind;
10012 void *buf1, *buf2;
10013 Py_ssize_t len1, len2;
10014 PyObject* out;
10015
Guido van Rossumd57fd912000-03-10 22:53:23 +000010016 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010017 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 if (PyUnicode_READY(self) == -1)
10020 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010021
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010023 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010025 if (PyUnicode_IS_ASCII(self))
10026 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010027 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010028 PyUnicode_GET_LENGTH(self), maxcount
10029 );
10030 else
10031 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010032 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010033 PyUnicode_GET_LENGTH(self), maxcount
10034 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010035 case PyUnicode_2BYTE_KIND:
10036 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010037 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 PyUnicode_GET_LENGTH(self), maxcount
10039 );
10040 case PyUnicode_4BYTE_KIND:
10041 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010042 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010043 PyUnicode_GET_LENGTH(self), maxcount
10044 );
10045 default:
10046 assert(0);
10047 return NULL;
10048 }
10049
10050 if (PyUnicode_READY(substring) == -1)
10051 return NULL;
10052
10053 kind1 = PyUnicode_KIND(self);
10054 kind2 = PyUnicode_KIND(substring);
10055 kind = kind1 > kind2 ? kind1 : kind2;
10056 buf1 = PyUnicode_DATA(self);
10057 buf2 = PyUnicode_DATA(substring);
10058 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010059 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010060 if (!buf1)
10061 return NULL;
10062 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010063 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064 if (!buf2) {
10065 if (kind1 != kind) PyMem_Free(buf1);
10066 return NULL;
10067 }
10068 len1 = PyUnicode_GET_LENGTH(self);
10069 len2 = PyUnicode_GET_LENGTH(substring);
10070
Benjamin Petersonead6b532011-12-20 17:23:42 -060010071 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010073 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10074 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010075 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010076 else
10077 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010078 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010079 break;
10080 case PyUnicode_2BYTE_KIND:
10081 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010082 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010083 break;
10084 case PyUnicode_4BYTE_KIND:
10085 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010086 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010087 break;
10088 default:
10089 out = NULL;
10090 }
10091 if (kind1 != kind)
10092 PyMem_Free(buf1);
10093 if (kind2 != kind)
10094 PyMem_Free(buf2);
10095 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010096}
10097
Alexander Belopolsky40018472011-02-26 01:02:56 +000010098static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010099rsplit(PyObject *self,
10100 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010101 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010102{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 int kind1, kind2, kind;
10104 void *buf1, *buf2;
10105 Py_ssize_t len1, len2;
10106 PyObject* out;
10107
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010108 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010109 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 if (PyUnicode_READY(self) == -1)
10112 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010115 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010117 if (PyUnicode_IS_ASCII(self))
10118 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010119 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010120 PyUnicode_GET_LENGTH(self), maxcount
10121 );
10122 else
10123 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010124 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010125 PyUnicode_GET_LENGTH(self), maxcount
10126 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010127 case PyUnicode_2BYTE_KIND:
10128 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010129 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 PyUnicode_GET_LENGTH(self), maxcount
10131 );
10132 case PyUnicode_4BYTE_KIND:
10133 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010134 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 PyUnicode_GET_LENGTH(self), maxcount
10136 );
10137 default:
10138 assert(0);
10139 return NULL;
10140 }
10141
10142 if (PyUnicode_READY(substring) == -1)
10143 return NULL;
10144
10145 kind1 = PyUnicode_KIND(self);
10146 kind2 = PyUnicode_KIND(substring);
10147 kind = kind1 > kind2 ? kind1 : kind2;
10148 buf1 = PyUnicode_DATA(self);
10149 buf2 = PyUnicode_DATA(substring);
10150 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010151 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 if (!buf1)
10153 return NULL;
10154 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010155 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 if (!buf2) {
10157 if (kind1 != kind) PyMem_Free(buf1);
10158 return NULL;
10159 }
10160 len1 = PyUnicode_GET_LENGTH(self);
10161 len2 = PyUnicode_GET_LENGTH(substring);
10162
Benjamin Petersonead6b532011-12-20 17:23:42 -060010163 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010165 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10166 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010167 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010168 else
10169 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010170 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 break;
10172 case PyUnicode_2BYTE_KIND:
10173 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010174 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 break;
10176 case PyUnicode_4BYTE_KIND:
10177 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010178 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 break;
10180 default:
10181 out = NULL;
10182 }
10183 if (kind1 != kind)
10184 PyMem_Free(buf1);
10185 if (kind2 != kind)
10186 PyMem_Free(buf2);
10187 return out;
10188}
10189
10190static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010191anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10192 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010194 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010196 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10197 return asciilib_find(buf1, len1, buf2, len2, offset);
10198 else
10199 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 case PyUnicode_2BYTE_KIND:
10201 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10202 case PyUnicode_4BYTE_KIND:
10203 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10204 }
10205 assert(0);
10206 return -1;
10207}
10208
10209static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010210anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10211 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010213 switch (kind) {
10214 case PyUnicode_1BYTE_KIND:
10215 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10216 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10217 else
10218 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10219 case PyUnicode_2BYTE_KIND:
10220 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10221 case PyUnicode_4BYTE_KIND:
10222 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10223 }
10224 assert(0);
10225 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010226}
10227
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010228static void
10229replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10230 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10231{
10232 int kind = PyUnicode_KIND(u);
10233 void *data = PyUnicode_DATA(u);
10234 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10235 if (kind == PyUnicode_1BYTE_KIND) {
10236 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10237 (Py_UCS1 *)data + len,
10238 u1, u2, maxcount);
10239 }
10240 else if (kind == PyUnicode_2BYTE_KIND) {
10241 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10242 (Py_UCS2 *)data + len,
10243 u1, u2, maxcount);
10244 }
10245 else {
10246 assert(kind == PyUnicode_4BYTE_KIND);
10247 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10248 (Py_UCS4 *)data + len,
10249 u1, u2, maxcount);
10250 }
10251}
10252
Alexander Belopolsky40018472011-02-26 01:02:56 +000010253static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010254replace(PyObject *self, PyObject *str1,
10255 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010256{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 PyObject *u;
10258 char *sbuf = PyUnicode_DATA(self);
10259 char *buf1 = PyUnicode_DATA(str1);
10260 char *buf2 = PyUnicode_DATA(str2);
10261 int srelease = 0, release1 = 0, release2 = 0;
10262 int skind = PyUnicode_KIND(self);
10263 int kind1 = PyUnicode_KIND(str1);
10264 int kind2 = PyUnicode_KIND(str2);
10265 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10266 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10267 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010268 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010269 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010270
10271 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010272 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010274 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010275
Victor Stinner59de0ee2011-10-07 10:01:28 +020010276 if (str1 == str2)
10277 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278
Victor Stinner49a0a212011-10-12 23:46:10 +020010279 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010280 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10281 if (maxchar < maxchar_str1)
10282 /* substring too wide to be present */
10283 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010284 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10285 /* Replacing str1 with str2 may cause a maxchar reduction in the
10286 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010287 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010288 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010291 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010293 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010295 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010296 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010297 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010298
Victor Stinner69ed0f42013-04-09 21:48:24 +020010299 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010300 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010301 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010302 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010303 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010305 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010307
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010308 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10309 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010310 }
10311 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 int rkind = skind;
10313 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010314 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 if (kind1 < rkind) {
10317 /* widen substring */
10318 buf1 = _PyUnicode_AsKind(str1, rkind);
10319 if (!buf1) goto error;
10320 release1 = 1;
10321 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010322 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010323 if (i < 0)
10324 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 if (rkind > kind2) {
10326 /* widen replacement */
10327 buf2 = _PyUnicode_AsKind(str2, rkind);
10328 if (!buf2) goto error;
10329 release2 = 1;
10330 }
10331 else if (rkind < kind2) {
10332 /* widen self and buf1 */
10333 rkind = kind2;
10334 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010335 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 sbuf = _PyUnicode_AsKind(self, rkind);
10337 if (!sbuf) goto error;
10338 srelease = 1;
10339 buf1 = _PyUnicode_AsKind(str1, rkind);
10340 if (!buf1) goto error;
10341 release1 = 1;
10342 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010343 u = PyUnicode_New(slen, maxchar);
10344 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010346 assert(PyUnicode_KIND(u) == rkind);
10347 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010348
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010349 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010350 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010351 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010352 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010353 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010355
10356 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010357 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010358 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010359 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010360 if (i == -1)
10361 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010362 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010364 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010366 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010367 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010368 }
10369 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010371 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372 int rkind = skind;
10373 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010376 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 buf1 = _PyUnicode_AsKind(str1, rkind);
10378 if (!buf1) goto error;
10379 release1 = 1;
10380 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010381 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010382 if (n == 0)
10383 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010385 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010386 buf2 = _PyUnicode_AsKind(str2, rkind);
10387 if (!buf2) goto error;
10388 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010389 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010391 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010392 rkind = kind2;
10393 sbuf = _PyUnicode_AsKind(self, rkind);
10394 if (!sbuf) goto error;
10395 srelease = 1;
10396 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010397 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 buf1 = _PyUnicode_AsKind(str1, rkind);
10399 if (!buf1) goto error;
10400 release1 = 1;
10401 }
10402 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10403 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010404 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 PyErr_SetString(PyExc_OverflowError,
10406 "replace string is too long");
10407 goto error;
10408 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010409 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010410 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010411 _Py_INCREF_UNICODE_EMPTY();
10412 if (!unicode_empty)
10413 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010414 u = unicode_empty;
10415 goto done;
10416 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010417 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 PyErr_SetString(PyExc_OverflowError,
10419 "replace string is too long");
10420 goto error;
10421 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010422 u = PyUnicode_New(new_size, maxchar);
10423 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010425 assert(PyUnicode_KIND(u) == rkind);
10426 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 ires = i = 0;
10428 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010429 while (n-- > 0) {
10430 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010431 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010432 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010433 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010434 if (j == -1)
10435 break;
10436 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010437 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010438 memcpy(res + rkind * ires,
10439 sbuf + rkind * i,
10440 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010442 }
10443 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010445 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010447 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010449 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010451 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010453 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010454 memcpy(res + rkind * ires,
10455 sbuf + rkind * i,
10456 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010457 }
10458 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010459 /* interleave */
10460 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010461 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010463 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010465 if (--n <= 0)
10466 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010467 memcpy(res + rkind * ires,
10468 sbuf + rkind * i,
10469 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470 ires++;
10471 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010472 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010473 memcpy(res + rkind * ires,
10474 sbuf + rkind * i,
10475 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010476 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010477 }
10478
10479 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010480 unicode_adjust_maxchar(&u);
10481 if (u == NULL)
10482 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010483 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010484
10485 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 if (srelease)
10487 PyMem_FREE(sbuf);
10488 if (release1)
10489 PyMem_FREE(buf1);
10490 if (release2)
10491 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010492 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010494
Benjamin Peterson29060642009-01-31 22:14:21 +000010495 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010496 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 if (srelease)
10498 PyMem_FREE(sbuf);
10499 if (release1)
10500 PyMem_FREE(buf1);
10501 if (release2)
10502 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010503 return unicode_result_unchanged(self);
10504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 error:
10506 if (srelease && sbuf)
10507 PyMem_FREE(sbuf);
10508 if (release1 && buf1)
10509 PyMem_FREE(buf1);
10510 if (release2 && buf2)
10511 PyMem_FREE(buf2);
10512 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010513}
10514
10515/* --- Unicode Object Methods --------------------------------------------- */
10516
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010517PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010518 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010519\n\
10520Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010521characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522
10523static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010524unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010526 if (PyUnicode_READY(self) == -1)
10527 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010528 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010529}
10530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010531PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010532 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533\n\
10534Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010535have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010536
10537static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010538unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010539{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010540 if (PyUnicode_READY(self) == -1)
10541 return NULL;
10542 if (PyUnicode_GET_LENGTH(self) == 0)
10543 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010544 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010545}
10546
Benjamin Petersond5890c82012-01-14 13:23:30 -050010547PyDoc_STRVAR(casefold__doc__,
10548 "S.casefold() -> str\n\
10549\n\
10550Return a version of S suitable for caseless comparisons.");
10551
10552static PyObject *
10553unicode_casefold(PyObject *self)
10554{
10555 if (PyUnicode_READY(self) == -1)
10556 return NULL;
10557 if (PyUnicode_IS_ASCII(self))
10558 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010559 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010560}
10561
10562
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010563/* Argument converter. Coerces to a single unicode character */
10564
10565static int
10566convert_uc(PyObject *obj, void *addr)
10567{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010569 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010570
Benjamin Peterson14339b62009-01-31 16:36:08 +000010571 uniobj = PyUnicode_FromObject(obj);
10572 if (uniobj == NULL) {
10573 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010574 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010575 return 0;
10576 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010578 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010579 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010580 Py_DECREF(uniobj);
10581 return 0;
10582 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010584 Py_DECREF(uniobj);
10585 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010586}
10587
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010588PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010589 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010590\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010591Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010592done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593
10594static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010595unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010597 Py_ssize_t marg, left;
10598 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 Py_UCS4 fillchar = ' ';
10600
Victor Stinnere9a29352011-10-01 02:14:59 +020010601 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603
Benjamin Petersonbac79492012-01-14 13:34:47 -050010604 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010605 return NULL;
10606
Victor Stinnerc4b49542011-12-11 22:44:26 +010010607 if (PyUnicode_GET_LENGTH(self) >= width)
10608 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010609
Victor Stinnerc4b49542011-12-11 22:44:26 +010010610 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611 left = marg / 2 + (marg & width & 1);
10612
Victor Stinner9310abb2011-10-05 00:59:23 +020010613 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614}
10615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616/* This function assumes that str1 and str2 are readied by the caller. */
10617
Marc-André Lemburge5034372000-08-08 08:04:29 +000010618static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010619unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010620{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010621#define COMPARE(TYPE1, TYPE2) \
10622 do { \
10623 TYPE1* p1 = (TYPE1 *)data1; \
10624 TYPE2* p2 = (TYPE2 *)data2; \
10625 TYPE1* end = p1 + len; \
10626 Py_UCS4 c1, c2; \
10627 for (; p1 != end; p1++, p2++) { \
10628 c1 = *p1; \
10629 c2 = *p2; \
10630 if (c1 != c2) \
10631 return (c1 < c2) ? -1 : 1; \
10632 } \
10633 } \
10634 while (0)
10635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 int kind1, kind2;
10637 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010638 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 kind1 = PyUnicode_KIND(str1);
10641 kind2 = PyUnicode_KIND(str2);
10642 data1 = PyUnicode_DATA(str1);
10643 data2 = PyUnicode_DATA(str2);
10644 len1 = PyUnicode_GET_LENGTH(str1);
10645 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010646 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010647
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010648 switch(kind1) {
10649 case PyUnicode_1BYTE_KIND:
10650 {
10651 switch(kind2) {
10652 case PyUnicode_1BYTE_KIND:
10653 {
10654 int cmp = memcmp(data1, data2, len);
10655 /* normalize result of memcmp() into the range [-1; 1] */
10656 if (cmp < 0)
10657 return -1;
10658 if (cmp > 0)
10659 return 1;
10660 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010661 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010662 case PyUnicode_2BYTE_KIND:
10663 COMPARE(Py_UCS1, Py_UCS2);
10664 break;
10665 case PyUnicode_4BYTE_KIND:
10666 COMPARE(Py_UCS1, Py_UCS4);
10667 break;
10668 default:
10669 assert(0);
10670 }
10671 break;
10672 }
10673 case PyUnicode_2BYTE_KIND:
10674 {
10675 switch(kind2) {
10676 case PyUnicode_1BYTE_KIND:
10677 COMPARE(Py_UCS2, Py_UCS1);
10678 break;
10679 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010680 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010681 COMPARE(Py_UCS2, Py_UCS2);
10682 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010683 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010684 case PyUnicode_4BYTE_KIND:
10685 COMPARE(Py_UCS2, Py_UCS4);
10686 break;
10687 default:
10688 assert(0);
10689 }
10690 break;
10691 }
10692 case PyUnicode_4BYTE_KIND:
10693 {
10694 switch(kind2) {
10695 case PyUnicode_1BYTE_KIND:
10696 COMPARE(Py_UCS4, Py_UCS1);
10697 break;
10698 case PyUnicode_2BYTE_KIND:
10699 COMPARE(Py_UCS4, Py_UCS2);
10700 break;
10701 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010702 {
10703#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10704 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10705 /* normalize result of wmemcmp() into the range [-1; 1] */
10706 if (cmp < 0)
10707 return -1;
10708 if (cmp > 0)
10709 return 1;
10710#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010711 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010712#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010713 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010714 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010715 default:
10716 assert(0);
10717 }
10718 break;
10719 }
10720 default:
10721 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010722 }
10723
Victor Stinner770e19e2012-10-04 22:59:45 +020010724 if (len1 == len2)
10725 return 0;
10726 if (len1 < len2)
10727 return -1;
10728 else
10729 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010730
10731#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010732}
10733
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010734Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010735unicode_compare_eq(PyObject *str1, PyObject *str2)
10736{
10737 int kind;
10738 void *data1, *data2;
10739 Py_ssize_t len;
10740 int cmp;
10741
Victor Stinnere5567ad2012-10-23 02:48:49 +020010742 len = PyUnicode_GET_LENGTH(str1);
10743 if (PyUnicode_GET_LENGTH(str2) != len)
10744 return 0;
10745 kind = PyUnicode_KIND(str1);
10746 if (PyUnicode_KIND(str2) != kind)
10747 return 0;
10748 data1 = PyUnicode_DATA(str1);
10749 data2 = PyUnicode_DATA(str2);
10750
10751 cmp = memcmp(data1, data2, len * kind);
10752 return (cmp == 0);
10753}
10754
10755
Alexander Belopolsky40018472011-02-26 01:02:56 +000010756int
10757PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010758{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010759 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10760 if (PyUnicode_READY(left) == -1 ||
10761 PyUnicode_READY(right) == -1)
10762 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010763
10764 /* a string is equal to itself */
10765 if (left == right)
10766 return 0;
10767
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010768 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010770 PyErr_Format(PyExc_TypeError,
10771 "Can't compare %.100s and %.100s",
10772 left->ob_type->tp_name,
10773 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774 return -1;
10775}
10776
Martin v. Löwis5b222132007-06-10 09:51:05 +000010777int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010778_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10779{
10780 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10781 if (right_str == NULL)
10782 return -1;
10783 return PyUnicode_Compare(left, right_str);
10784}
10785
10786int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010787PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10788{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010789 Py_ssize_t i;
10790 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 Py_UCS4 chr;
10792
Victor Stinner910337b2011-10-03 03:20:16 +020010793 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 if (PyUnicode_READY(uni) == -1)
10795 return -1;
10796 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010797 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010798 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010799 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010800 size_t len, len2 = strlen(str);
10801 int cmp;
10802
10803 len = Py_MIN(len1, len2);
10804 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010805 if (cmp != 0) {
10806 if (cmp < 0)
10807 return -1;
10808 else
10809 return 1;
10810 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010811 if (len1 > len2)
10812 return 1; /* uni is longer */
10813 if (len2 > len1)
10814 return -1; /* str is longer */
10815 return 0;
10816 }
10817 else {
10818 void *data = PyUnicode_DATA(uni);
10819 /* Compare Unicode string and source character set string */
10820 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10821 if (chr != str[i])
10822 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10823 /* This check keeps Python strings that end in '\0' from comparing equal
10824 to C strings identical up to that point. */
10825 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10826 return 1; /* uni is longer */
10827 if (str[i])
10828 return -1; /* str is longer */
10829 return 0;
10830 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010831}
10832
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010833
Benjamin Peterson29060642009-01-31 22:14:21 +000010834#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010835 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010836
Alexander Belopolsky40018472011-02-26 01:02:56 +000010837PyObject *
10838PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010839{
10840 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010841 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010842
Victor Stinnere5567ad2012-10-23 02:48:49 +020010843 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10844 Py_RETURN_NOTIMPLEMENTED;
10845
10846 if (PyUnicode_READY(left) == -1 ||
10847 PyUnicode_READY(right) == -1)
10848 return NULL;
10849
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010850 if (left == right) {
10851 switch (op) {
10852 case Py_EQ:
10853 case Py_LE:
10854 case Py_GE:
10855 /* a string is equal to itself */
10856 v = Py_True;
10857 break;
10858 case Py_NE:
10859 case Py_LT:
10860 case Py_GT:
10861 v = Py_False;
10862 break;
10863 default:
10864 PyErr_BadArgument();
10865 return NULL;
10866 }
10867 }
10868 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010869 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010870 result ^= (op == Py_NE);
10871 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010872 }
10873 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010874 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010875
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010876 /* Convert the return value to a Boolean */
10877 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010878 case Py_LE:
10879 v = TEST_COND(result <= 0);
10880 break;
10881 case Py_GE:
10882 v = TEST_COND(result >= 0);
10883 break;
10884 case Py_LT:
10885 v = TEST_COND(result == -1);
10886 break;
10887 case Py_GT:
10888 v = TEST_COND(result == 1);
10889 break;
10890 default:
10891 PyErr_BadArgument();
10892 return NULL;
10893 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010894 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010895 Py_INCREF(v);
10896 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010897}
10898
Alexander Belopolsky40018472011-02-26 01:02:56 +000010899int
10900PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010901{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010902 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010903 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010904 void *buf1, *buf2;
10905 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010906 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010907
10908 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010909 sub = PyUnicode_FromObject(element);
10910 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010911 PyErr_Format(PyExc_TypeError,
10912 "'in <string>' requires string as left operand, not %s",
10913 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010914 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010915 }
10916
Thomas Wouters477c8d52006-05-27 19:21:47 +000010917 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010918 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010919 Py_DECREF(sub);
10920 return -1;
10921 }
10922
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010923 kind1 = PyUnicode_KIND(str);
10924 kind2 = PyUnicode_KIND(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010925 buf1 = PyUnicode_DATA(str);
10926 buf2 = PyUnicode_DATA(sub);
Victor Stinner77282cb2013-04-14 19:22:47 +020010927 if (kind2 != kind1) {
10928 if (kind2 > kind1) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010929 Py_DECREF(sub);
10930 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010931 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010932 }
Victor Stinner77282cb2013-04-14 19:22:47 +020010933 buf2 = _PyUnicode_AsKind(sub, kind1);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010934 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010935 if (!buf2) {
10936 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010937 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938 return -1;
10939 }
10940 len1 = PyUnicode_GET_LENGTH(str);
10941 len2 = PyUnicode_GET_LENGTH(sub);
10942
Victor Stinner77282cb2013-04-14 19:22:47 +020010943 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010944 case PyUnicode_1BYTE_KIND:
10945 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10946 break;
10947 case PyUnicode_2BYTE_KIND:
10948 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10949 break;
10950 case PyUnicode_4BYTE_KIND:
10951 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10952 break;
10953 default:
10954 result = -1;
10955 assert(0);
10956 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010957
10958 Py_DECREF(str);
10959 Py_DECREF(sub);
10960
Victor Stinner77282cb2013-04-14 19:22:47 +020010961 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010962 PyMem_Free(buf2);
10963
Guido van Rossum403d68b2000-03-13 15:55:09 +000010964 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010965}
10966
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967/* Concat to string or Unicode object giving a new Unicode object. */
10968
Alexander Belopolsky40018472011-02-26 01:02:56 +000010969PyObject *
10970PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010971{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010972 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010973 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010974 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010975
10976 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010977 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010979 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010980 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010981 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010982 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983
10984 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010985 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010986 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010987 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010989 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010990 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992 }
10993
Victor Stinner488fa492011-12-12 00:01:39 +010010994 u_len = PyUnicode_GET_LENGTH(u);
10995 v_len = PyUnicode_GET_LENGTH(v);
10996 if (u_len > PY_SSIZE_T_MAX - v_len) {
10997 PyErr_SetString(PyExc_OverflowError,
10998 "strings are too large to concat");
10999 goto onError;
11000 }
11001 new_len = u_len + v_len;
11002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011004 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011005 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011006
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011008 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011009 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011010 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011011 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11012 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013 Py_DECREF(u);
11014 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011015 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011016 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017
Benjamin Peterson29060642009-01-31 22:14:21 +000011018 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019 Py_XDECREF(u);
11020 Py_XDECREF(v);
11021 return NULL;
11022}
11023
Walter Dörwald1ab83302007-05-18 17:15:44 +000011024void
Victor Stinner23e56682011-10-03 03:54:37 +020011025PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011026{
Victor Stinner23e56682011-10-03 03:54:37 +020011027 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011028 Py_UCS4 maxchar, maxchar2;
11029 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011030
11031 if (p_left == NULL) {
11032 if (!PyErr_Occurred())
11033 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011034 return;
11035 }
Victor Stinner23e56682011-10-03 03:54:37 +020011036 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011037 if (right == NULL || left == NULL
11038 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011039 if (!PyErr_Occurred())
11040 PyErr_BadInternalCall();
11041 goto error;
11042 }
11043
Benjamin Petersonbac79492012-01-14 13:34:47 -050011044 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011045 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011046 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011047 goto error;
11048
Victor Stinner488fa492011-12-12 00:01:39 +010011049 /* Shortcuts */
11050 if (left == unicode_empty) {
11051 Py_DECREF(left);
11052 Py_INCREF(right);
11053 *p_left = right;
11054 return;
11055 }
11056 if (right == unicode_empty)
11057 return;
11058
11059 left_len = PyUnicode_GET_LENGTH(left);
11060 right_len = PyUnicode_GET_LENGTH(right);
11061 if (left_len > PY_SSIZE_T_MAX - right_len) {
11062 PyErr_SetString(PyExc_OverflowError,
11063 "strings are too large to concat");
11064 goto error;
11065 }
11066 new_len = left_len + right_len;
11067
11068 if (unicode_modifiable(left)
11069 && PyUnicode_CheckExact(right)
11070 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011071 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11072 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011073 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011074 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011075 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11076 {
11077 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011078 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011079 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011080
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011081 /* copy 'right' into the newly allocated area of 'left' */
11082 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011083 }
Victor Stinner488fa492011-12-12 00:01:39 +010011084 else {
11085 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11086 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011087 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011088
Victor Stinner488fa492011-12-12 00:01:39 +010011089 /* Concat the two Unicode strings */
11090 res = PyUnicode_New(new_len, maxchar);
11091 if (res == NULL)
11092 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011093 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11094 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011095 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011096 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011097 }
11098 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011099 return;
11100
11101error:
Victor Stinner488fa492011-12-12 00:01:39 +010011102 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011103}
11104
11105void
11106PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11107{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011108 PyUnicode_Append(pleft, right);
11109 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011110}
11111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011112PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011113 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011114\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011115Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011116string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011117interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118
11119static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011120unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011121{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011122 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011123 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011124 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011125 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011126 int kind1, kind2, kind;
11127 void *buf1, *buf2;
11128 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129
Jesus Ceaac451502011-04-20 17:09:23 +020011130 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11131 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011132 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011134 kind1 = PyUnicode_KIND(self);
11135 kind2 = PyUnicode_KIND(substring);
Christian Heimesd47802e2013-06-29 21:33:36 +020011136 if (kind2 > kind1) {
11137 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011138 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011139 }
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011140 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 buf1 = PyUnicode_DATA(self);
11142 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011143 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011144 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011145 if (!buf2) {
11146 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011147 return NULL;
11148 }
11149 len1 = PyUnicode_GET_LENGTH(self);
11150 len2 = PyUnicode_GET_LENGTH(substring);
11151
11152 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011153 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011154 case PyUnicode_1BYTE_KIND:
11155 iresult = ucs1lib_count(
11156 ((Py_UCS1*)buf1) + start, end - start,
11157 buf2, len2, PY_SSIZE_T_MAX
11158 );
11159 break;
11160 case PyUnicode_2BYTE_KIND:
11161 iresult = ucs2lib_count(
11162 ((Py_UCS2*)buf1) + start, end - start,
11163 buf2, len2, PY_SSIZE_T_MAX
11164 );
11165 break;
11166 case PyUnicode_4BYTE_KIND:
11167 iresult = ucs4lib_count(
11168 ((Py_UCS4*)buf1) + start, end - start,
11169 buf2, len2, PY_SSIZE_T_MAX
11170 );
11171 break;
11172 default:
11173 assert(0); iresult = 0;
11174 }
11175
11176 result = PyLong_FromSsize_t(iresult);
11177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011178 if (kind2 != kind)
11179 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180
11181 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011182
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183 return result;
11184}
11185
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011186PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011187 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011189Encode S using the codec registered for encoding. Default encoding\n\
11190is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011191handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011192a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11193'xmlcharrefreplace' as well as any other name registered with\n\
11194codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195
11196static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011197unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011198{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011199 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011200 char *encoding = NULL;
11201 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011202
Benjamin Peterson308d6372009-09-18 21:42:35 +000011203 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11204 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011206 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011207}
11208
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011209PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011210 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211\n\
11212Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011213If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011214
11215static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011216unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011218 Py_ssize_t i, j, line_pos, src_len, incr;
11219 Py_UCS4 ch;
11220 PyObject *u;
11221 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011222 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011224 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011225 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011226
Ezio Melotti745d54d2013-11-16 19:10:57 +020011227 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11228 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011229 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230
Antoine Pitrou22425222011-10-04 19:10:51 +020011231 if (PyUnicode_READY(self) == -1)
11232 return NULL;
11233
Thomas Wouters7e474022000-07-16 12:04:32 +000011234 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011235 src_len = PyUnicode_GET_LENGTH(self);
11236 i = j = line_pos = 0;
11237 kind = PyUnicode_KIND(self);
11238 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011239 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011240 for (; i < src_len; i++) {
11241 ch = PyUnicode_READ(kind, src_data, i);
11242 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011243 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011244 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011245 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011246 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011247 goto overflow;
11248 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011249 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011250 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011253 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011254 goto overflow;
11255 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011257 if (ch == '\n' || ch == '\r')
11258 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011260 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011261 if (!found)
11262 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011263
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011265 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266 if (!u)
11267 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011268 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269
Antoine Pitroue71d5742011-10-04 15:55:09 +020011270 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011271
Antoine Pitroue71d5742011-10-04 15:55:09 +020011272 for (; i < src_len; i++) {
11273 ch = PyUnicode_READ(kind, src_data, i);
11274 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011275 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011276 incr = tabsize - (line_pos % tabsize);
11277 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011278 FILL(kind, dest_data, ' ', j, incr);
11279 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011280 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011281 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011282 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011283 line_pos++;
11284 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011285 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011286 if (ch == '\n' || ch == '\r')
11287 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011288 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011289 }
11290 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011291 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011292
Antoine Pitroue71d5742011-10-04 15:55:09 +020011293 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011294 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11295 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011296}
11297
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011298PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011299 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300\n\
11301Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011302such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303arguments start and end are interpreted as in slice notation.\n\
11304\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011305Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306
11307static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011308unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011310 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011311 Py_ssize_t start;
11312 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011313 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314
Jesus Ceaac451502011-04-20 17:09:23 +020011315 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11316 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318
Christian Heimesd47802e2013-06-29 21:33:36 +020011319 if (PyUnicode_READY(self) == -1) {
11320 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011321 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011322 }
11323 if (PyUnicode_READY(substring) == -1) {
11324 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011325 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011326 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011327
Victor Stinner7931d9a2011-11-04 00:22:48 +010011328 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329
11330 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011331
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011332 if (result == -2)
11333 return NULL;
11334
Christian Heimes217cfd12007-12-02 14:31:20 +000011335 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336}
11337
11338static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011339unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011340{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011341 void *data;
11342 enum PyUnicode_Kind kind;
11343 Py_UCS4 ch;
11344 PyObject *res;
11345
11346 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11347 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011348 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011349 }
11350 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11351 PyErr_SetString(PyExc_IndexError, "string index out of range");
11352 return NULL;
11353 }
11354 kind = PyUnicode_KIND(self);
11355 data = PyUnicode_DATA(self);
11356 ch = PyUnicode_READ(kind, data, index);
11357 if (ch < 256)
11358 return get_latin1_char(ch);
11359
11360 res = PyUnicode_New(1, ch);
11361 if (res == NULL)
11362 return NULL;
11363 kind = PyUnicode_KIND(res);
11364 data = PyUnicode_DATA(res);
11365 PyUnicode_WRITE(kind, data, 0, ch);
11366 assert(_PyUnicode_CheckConsistency(res, 1));
11367 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368}
11369
Guido van Rossumc2504932007-09-18 19:42:40 +000011370/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011371 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011372static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011373unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374{
Guido van Rossumc2504932007-09-18 19:42:40 +000011375 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011376 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011377
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011378#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011379 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011380#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011381 if (_PyUnicode_HASH(self) != -1)
11382 return _PyUnicode_HASH(self);
11383 if (PyUnicode_READY(self) == -1)
11384 return -1;
11385 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011386 /*
11387 We make the hash of the empty string be 0, rather than using
11388 (prefix ^ suffix), since this slightly obfuscates the hash secret
11389 */
11390 if (len == 0) {
11391 _PyUnicode_HASH(self) = 0;
11392 return 0;
11393 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011394 x = _Py_HashBytes(PyUnicode_DATA(self),
11395 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011396 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011397 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011399#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011400
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011401PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011402 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011404Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405
11406static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011407unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011409 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011410 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011411 Py_ssize_t start;
11412 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413
Jesus Ceaac451502011-04-20 17:09:23 +020011414 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11415 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417
Christian Heimesd47a0452013-06-29 21:21:37 +020011418 if (PyUnicode_READY(self) == -1) {
11419 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011420 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011421 }
11422 if (PyUnicode_READY(substring) == -1) {
11423 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011425 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426
Victor Stinner7931d9a2011-11-04 00:22:48 +010011427 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428
11429 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011430
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431 if (result == -2)
11432 return NULL;
11433
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434 if (result < 0) {
11435 PyErr_SetString(PyExc_ValueError, "substring not found");
11436 return NULL;
11437 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011438
Christian Heimes217cfd12007-12-02 14:31:20 +000011439 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440}
11441
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011442PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011443 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011445Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011446at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447
11448static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011449unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011451 Py_ssize_t i, length;
11452 int kind;
11453 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454 int cased;
11455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011456 if (PyUnicode_READY(self) == -1)
11457 return NULL;
11458 length = PyUnicode_GET_LENGTH(self);
11459 kind = PyUnicode_KIND(self);
11460 data = PyUnicode_DATA(self);
11461
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011463 if (length == 1)
11464 return PyBool_FromLong(
11465 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011467 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011468 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011469 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011470
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011472 for (i = 0; i < length; i++) {
11473 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011474
Benjamin Peterson29060642009-01-31 22:14:21 +000011475 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11476 return PyBool_FromLong(0);
11477 else if (!cased && Py_UNICODE_ISLOWER(ch))
11478 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011480 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481}
11482
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011483PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011484 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011486Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011487at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488
11489static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011490unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011492 Py_ssize_t i, length;
11493 int kind;
11494 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495 int cased;
11496
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011497 if (PyUnicode_READY(self) == -1)
11498 return NULL;
11499 length = PyUnicode_GET_LENGTH(self);
11500 kind = PyUnicode_KIND(self);
11501 data = PyUnicode_DATA(self);
11502
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011504 if (length == 1)
11505 return PyBool_FromLong(
11506 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011508 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011509 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011510 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011511
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 for (i = 0; i < length; i++) {
11514 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011515
Benjamin Peterson29060642009-01-31 22:14:21 +000011516 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11517 return PyBool_FromLong(0);
11518 else if (!cased && Py_UNICODE_ISUPPER(ch))
11519 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011521 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522}
11523
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011524PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011525 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011527Return True if S is a titlecased string and there is at least one\n\
11528character in S, i.e. upper- and titlecase characters may only\n\
11529follow uncased characters and lowercase characters only cased ones.\n\
11530Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531
11532static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011533unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535 Py_ssize_t i, length;
11536 int kind;
11537 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538 int cased, previous_is_cased;
11539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011540 if (PyUnicode_READY(self) == -1)
11541 return NULL;
11542 length = PyUnicode_GET_LENGTH(self);
11543 kind = PyUnicode_KIND(self);
11544 data = PyUnicode_DATA(self);
11545
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 if (length == 1) {
11548 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11549 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11550 (Py_UNICODE_ISUPPER(ch) != 0));
11551 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011553 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011554 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011556
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557 cased = 0;
11558 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011559 for (i = 0; i < length; i++) {
11560 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011561
Benjamin Peterson29060642009-01-31 22:14:21 +000011562 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11563 if (previous_is_cased)
11564 return PyBool_FromLong(0);
11565 previous_is_cased = 1;
11566 cased = 1;
11567 }
11568 else if (Py_UNICODE_ISLOWER(ch)) {
11569 if (!previous_is_cased)
11570 return PyBool_FromLong(0);
11571 previous_is_cased = 1;
11572 cased = 1;
11573 }
11574 else
11575 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011577 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578}
11579
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011580PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011583Return True if all characters in S are whitespace\n\
11584and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585
11586static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011587unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011589 Py_ssize_t i, length;
11590 int kind;
11591 void *data;
11592
11593 if (PyUnicode_READY(self) == -1)
11594 return NULL;
11595 length = PyUnicode_GET_LENGTH(self);
11596 kind = PyUnicode_KIND(self);
11597 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011600 if (length == 1)
11601 return PyBool_FromLong(
11602 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011604 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011606 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011607
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011608 for (i = 0; i < length; i++) {
11609 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011610 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011611 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011613 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614}
11615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011616PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011617 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011618\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011619Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011620and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011621
11622static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011623unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011624{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 Py_ssize_t i, length;
11626 int kind;
11627 void *data;
11628
11629 if (PyUnicode_READY(self) == -1)
11630 return NULL;
11631 length = PyUnicode_GET_LENGTH(self);
11632 kind = PyUnicode_KIND(self);
11633 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011634
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011635 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011636 if (length == 1)
11637 return PyBool_FromLong(
11638 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011639
11640 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011642 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011644 for (i = 0; i < length; i++) {
11645 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011646 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011647 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011648 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011649}
11650
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011651PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011652 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011653\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011654Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011655and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011656
11657static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011658unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011659{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011660 int kind;
11661 void *data;
11662 Py_ssize_t len, i;
11663
11664 if (PyUnicode_READY(self) == -1)
11665 return NULL;
11666
11667 kind = PyUnicode_KIND(self);
11668 data = PyUnicode_DATA(self);
11669 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011670
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011671 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011672 if (len == 1) {
11673 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11674 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11675 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011676
11677 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011679 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011680
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011681 for (i = 0; i < len; i++) {
11682 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011683 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011684 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011685 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011686 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011687}
11688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011689PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011690 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011692Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011693False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694
11695static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011696unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011698 Py_ssize_t i, length;
11699 int kind;
11700 void *data;
11701
11702 if (PyUnicode_READY(self) == -1)
11703 return NULL;
11704 length = PyUnicode_GET_LENGTH(self);
11705 kind = PyUnicode_KIND(self);
11706 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709 if (length == 1)
11710 return PyBool_FromLong(
11711 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011713 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011715 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011717 for (i = 0; i < length; i++) {
11718 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011719 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011721 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722}
11723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011724PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011725 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011727Return True if all characters in S are digits\n\
11728and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729
11730static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011731unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011733 Py_ssize_t i, length;
11734 int kind;
11735 void *data;
11736
11737 if (PyUnicode_READY(self) == -1)
11738 return NULL;
11739 length = PyUnicode_GET_LENGTH(self);
11740 kind = PyUnicode_KIND(self);
11741 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011744 if (length == 1) {
11745 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11746 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11747 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011749 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011750 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011751 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 for (i = 0; i < length; i++) {
11754 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011755 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011757 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758}
11759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011760PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011761 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011763Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011764False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765
11766static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011767unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 Py_ssize_t i, length;
11770 int kind;
11771 void *data;
11772
11773 if (PyUnicode_READY(self) == -1)
11774 return NULL;
11775 length = PyUnicode_GET_LENGTH(self);
11776 kind = PyUnicode_KIND(self);
11777 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011780 if (length == 1)
11781 return PyBool_FromLong(
11782 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011784 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011786 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788 for (i = 0; i < length; i++) {
11789 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011790 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011792 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793}
11794
Martin v. Löwis47383402007-08-15 07:32:56 +000011795int
11796PyUnicode_IsIdentifier(PyObject *self)
11797{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011798 int kind;
11799 void *data;
11800 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011801 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011803 if (PyUnicode_READY(self) == -1) {
11804 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011805 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011806 }
11807
11808 /* Special case for empty strings */
11809 if (PyUnicode_GET_LENGTH(self) == 0)
11810 return 0;
11811 kind = PyUnicode_KIND(self);
11812 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011813
11814 /* PEP 3131 says that the first character must be in
11815 XID_Start and subsequent characters in XID_Continue,
11816 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011817 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011818 letters, digits, underscore). However, given the current
11819 definition of XID_Start and XID_Continue, it is sufficient
11820 to check just for these, except that _ must be allowed
11821 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011822 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011823 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011824 return 0;
11825
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011826 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011827 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011828 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011829 return 1;
11830}
11831
11832PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011833 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011834\n\
11835Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011836to the language definition.\n\
11837\n\
11838Use keyword.iskeyword() to test for reserved identifiers\n\
11839such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011840
11841static PyObject*
11842unicode_isidentifier(PyObject *self)
11843{
11844 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11845}
11846
Georg Brandl559e5d72008-06-11 18:37:52 +000011847PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011848 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011849\n\
11850Return True if all characters in S are considered\n\
11851printable in repr() or S is empty, False otherwise.");
11852
11853static PyObject*
11854unicode_isprintable(PyObject *self)
11855{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011856 Py_ssize_t i, length;
11857 int kind;
11858 void *data;
11859
11860 if (PyUnicode_READY(self) == -1)
11861 return NULL;
11862 length = PyUnicode_GET_LENGTH(self);
11863 kind = PyUnicode_KIND(self);
11864 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011865
11866 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011867 if (length == 1)
11868 return PyBool_FromLong(
11869 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 for (i = 0; i < length; i++) {
11872 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011873 Py_RETURN_FALSE;
11874 }
11875 }
11876 Py_RETURN_TRUE;
11877}
11878
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011879PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011880 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881\n\
11882Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011883iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884
11885static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011886unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011888 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889}
11890
Martin v. Löwis18e16552006-02-15 17:27:45 +000011891static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011892unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011894 if (PyUnicode_READY(self) == -1)
11895 return -1;
11896 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897}
11898
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011899PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011900 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011902Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011903done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904
11905static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011906unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011908 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011909 Py_UCS4 fillchar = ' ';
11910
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011911 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912 return NULL;
11913
Benjamin Petersonbac79492012-01-14 13:34:47 -050011914 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011915 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916
Victor Stinnerc4b49542011-12-11 22:44:26 +010011917 if (PyUnicode_GET_LENGTH(self) >= width)
11918 return unicode_result_unchanged(self);
11919
11920 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921}
11922
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011923PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011924 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011926Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927
11928static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011929unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011931 if (PyUnicode_READY(self) == -1)
11932 return NULL;
11933 if (PyUnicode_IS_ASCII(self))
11934 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011935 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936}
11937
/* Selectors for the strip family of methods.  The values double as
   indexes into stripformat[] below, so the two must stay in step. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format string for each selector, in selector order. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i] + 3)
11946
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011947/* externally visible for str.strip(unicode) */
11948PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011949_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011950{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 void *data;
11952 int kind;
11953 Py_ssize_t i, j, len;
11954 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011955 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11958 return NULL;
11959
11960 kind = PyUnicode_KIND(self);
11961 data = PyUnicode_DATA(self);
11962 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011963 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11965 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011966 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011967
Benjamin Peterson14339b62009-01-31 16:36:08 +000011968 i = 0;
11969 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011970 while (i < len) {
11971 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11972 if (!BLOOM(sepmask, ch))
11973 break;
11974 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11975 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011976 i++;
11977 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011978 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011979
Benjamin Peterson14339b62009-01-31 16:36:08 +000011980 j = len;
11981 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011982 j--;
11983 while (j >= i) {
11984 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11985 if (!BLOOM(sepmask, ch))
11986 break;
11987 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11988 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011990 }
11991
Benjamin Peterson29060642009-01-31 22:14:21 +000011992 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011993 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011994
Victor Stinner7931d9a2011-11-04 00:22:48 +010011995 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996}
11997
11998PyObject*
11999PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12000{
12001 unsigned char *data;
12002 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012003 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004
Victor Stinnerde636f32011-10-01 03:55:54 +020012005 if (PyUnicode_READY(self) == -1)
12006 return NULL;
12007
Victor Stinner684d5fd2012-05-03 02:32:34 +020012008 length = PyUnicode_GET_LENGTH(self);
12009 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012010
Victor Stinner684d5fd2012-05-03 02:32:34 +020012011 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012012 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013
Victor Stinnerde636f32011-10-01 03:55:54 +020012014 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012015 PyErr_SetString(PyExc_IndexError, "string index out of range");
12016 return NULL;
12017 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012018 if (start >= length || end < start)
12019 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012020
Victor Stinner684d5fd2012-05-03 02:32:34 +020012021 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012022 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012023 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012024 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012025 }
12026 else {
12027 kind = PyUnicode_KIND(self);
12028 data = PyUnicode_1BYTE_DATA(self);
12029 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012030 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012031 length);
12032 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034
12035static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012036do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012037{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038 Py_ssize_t len, i, j;
12039
12040 if (PyUnicode_READY(self) == -1)
12041 return NULL;
12042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012043 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012044
Victor Stinnercc7af722013-04-09 22:39:24 +020012045 if (PyUnicode_IS_ASCII(self)) {
12046 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12047
12048 i = 0;
12049 if (striptype != RIGHTSTRIP) {
12050 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012051 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012052 if (!_Py_ascii_whitespace[ch])
12053 break;
12054 i++;
12055 }
12056 }
12057
12058 j = len;
12059 if (striptype != LEFTSTRIP) {
12060 j--;
12061 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012062 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012063 if (!_Py_ascii_whitespace[ch])
12064 break;
12065 j--;
12066 }
12067 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012068 }
12069 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012070 else {
12071 int kind = PyUnicode_KIND(self);
12072 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012073
Victor Stinnercc7af722013-04-09 22:39:24 +020012074 i = 0;
12075 if (striptype != RIGHTSTRIP) {
12076 while (i < len) {
12077 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12078 if (!Py_UNICODE_ISSPACE(ch))
12079 break;
12080 i++;
12081 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012082 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012083
12084 j = len;
12085 if (striptype != LEFTSTRIP) {
12086 j--;
12087 while (j >= i) {
12088 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12089 if (!Py_UNICODE_ISSPACE(ch))
12090 break;
12091 j--;
12092 }
12093 j++;
12094 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012095 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012096
Victor Stinner7931d9a2011-11-04 00:22:48 +010012097 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098}
12099
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012100
12101static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012102do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012103{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012104 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012105
Serhiy Storchakac6792272013-10-19 21:03:34 +030012106 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012107 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012108
Benjamin Peterson14339b62009-01-31 16:36:08 +000012109 if (sep != NULL && sep != Py_None) {
12110 if (PyUnicode_Check(sep))
12111 return _PyUnicode_XStrip(self, striptype, sep);
12112 else {
12113 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012114 "%s arg must be None or str",
12115 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012116 return NULL;
12117 }
12118 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012119
Benjamin Peterson14339b62009-01-31 16:36:08 +000012120 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012121}
12122
12123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012124PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012125 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012126\n\
12127Return a copy of the string S with leading and trailing\n\
12128whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012129If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012130
12131static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012132unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012133{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012134 if (PyTuple_GET_SIZE(args) == 0)
12135 return do_strip(self, BOTHSTRIP); /* Common case */
12136 else
12137 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012138}
12139
12140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012141PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012142 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012143\n\
12144Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012145If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012146
12147static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012148unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012149{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012150 if (PyTuple_GET_SIZE(args) == 0)
12151 return do_strip(self, LEFTSTRIP); /* Common case */
12152 else
12153 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012154}
12155
12156
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012157PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012158 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012159\n\
12160Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012161If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012162
12163static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012164unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012165{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012166 if (PyTuple_GET_SIZE(args) == 0)
12167 return do_strip(self, RIGHTSTRIP); /* Common case */
12168 else
12169 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012170}
12171
12172
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012174unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012175{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012176 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012177 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178
Serhiy Storchaka05997252013-01-26 12:14:02 +020012179 if (len < 1)
12180 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181
Victor Stinnerc4b49542011-12-11 22:44:26 +010012182 /* no repeat, return original string */
12183 if (len == 1)
12184 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012185
Benjamin Petersonbac79492012-01-14 13:34:47 -050012186 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012187 return NULL;
12188
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012189 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012190 PyErr_SetString(PyExc_OverflowError,
12191 "repeated string is too long");
12192 return NULL;
12193 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012195
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012196 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197 if (!u)
12198 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012199 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012201 if (PyUnicode_GET_LENGTH(str) == 1) {
12202 const int kind = PyUnicode_KIND(str);
12203 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012204 if (kind == PyUnicode_1BYTE_KIND) {
12205 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012206 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012207 }
12208 else if (kind == PyUnicode_2BYTE_KIND) {
12209 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012210 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012211 ucs2[n] = fill_char;
12212 } else {
12213 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12214 assert(kind == PyUnicode_4BYTE_KIND);
12215 for (n = 0; n < len; ++n)
12216 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218 }
12219 else {
12220 /* number of characters copied this far */
12221 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012222 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223 char *to = (char *) PyUnicode_DATA(u);
12224 Py_MEMCPY(to, PyUnicode_DATA(str),
12225 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012226 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012227 n = (done <= nchars-done) ? done : nchars-done;
12228 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012229 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231 }
12232
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012233 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012234 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235}
12236
Alexander Belopolsky40018472011-02-26 01:02:56 +000012237PyObject *
12238PyUnicode_Replace(PyObject *obj,
12239 PyObject *subobj,
12240 PyObject *replobj,
12241 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242{
12243 PyObject *self;
12244 PyObject *str1;
12245 PyObject *str2;
12246 PyObject *result;
12247
12248 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012249 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012250 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012252 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012253 Py_DECREF(self);
12254 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255 }
12256 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012257 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012258 Py_DECREF(self);
12259 Py_DECREF(str1);
12260 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012262 if (PyUnicode_READY(self) == -1 ||
12263 PyUnicode_READY(str1) == -1 ||
12264 PyUnicode_READY(str2) == -1)
12265 result = NULL;
12266 else
12267 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268 Py_DECREF(self);
12269 Py_DECREF(str1);
12270 Py_DECREF(str2);
12271 return result;
12272}
12273
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012274PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012275 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276\n\
12277Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012278old replaced by new. If the optional argument count is\n\
12279given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280
12281static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012282unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012284 PyObject *str1;
12285 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012286 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287 PyObject *result;
12288
Martin v. Löwis18e16552006-02-15 17:27:45 +000012289 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012291 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012292 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012293 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012294 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012295 return NULL;
12296 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012297 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012298 Py_DECREF(str1);
12299 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012300 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012301 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12302 result = NULL;
12303 else
12304 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305
12306 Py_DECREF(str1);
12307 Py_DECREF(str2);
12308 return result;
12309}
12310
Alexander Belopolsky40018472011-02-26 01:02:56 +000012311static PyObject *
12312unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012313{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012314 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012315 Py_ssize_t isize;
12316 Py_ssize_t osize, squote, dquote, i, o;
12317 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012318 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012320
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012321 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012322 return NULL;
12323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 isize = PyUnicode_GET_LENGTH(unicode);
12325 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012326
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012327 /* Compute length of output, quote characters, and
12328 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012329 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012330 max = 127;
12331 squote = dquote = 0;
12332 ikind = PyUnicode_KIND(unicode);
12333 for (i = 0; i < isize; i++) {
12334 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12335 switch (ch) {
12336 case '\'': squote++; osize++; break;
12337 case '"': dquote++; osize++; break;
12338 case '\\': case '\t': case '\r': case '\n':
12339 osize += 2; break;
12340 default:
12341 /* Fast-path ASCII */
12342 if (ch < ' ' || ch == 0x7f)
12343 osize += 4; /* \xHH */
12344 else if (ch < 0x7f)
12345 osize++;
12346 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12347 osize++;
12348 max = ch > max ? ch : max;
12349 }
12350 else if (ch < 0x100)
12351 osize += 4; /* \xHH */
12352 else if (ch < 0x10000)
12353 osize += 6; /* \uHHHH */
12354 else
12355 osize += 10; /* \uHHHHHHHH */
12356 }
12357 }
12358
12359 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012360 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012361 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012362 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 if (dquote)
12364 /* Both squote and dquote present. Use squote,
12365 and escape them */
12366 osize += squote;
12367 else
12368 quote = '"';
12369 }
Victor Stinner55c08782013-04-14 18:45:39 +020012370 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371
12372 repr = PyUnicode_New(osize, max);
12373 if (repr == NULL)
12374 return NULL;
12375 okind = PyUnicode_KIND(repr);
12376 odata = PyUnicode_DATA(repr);
12377
12378 PyUnicode_WRITE(okind, odata, 0, quote);
12379 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012380 if (unchanged) {
12381 _PyUnicode_FastCopyCharacters(repr, 1,
12382 unicode, 0,
12383 isize);
12384 }
12385 else {
12386 for (i = 0, o = 1; i < isize; i++) {
12387 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012388
Victor Stinner55c08782013-04-14 18:45:39 +020012389 /* Escape quotes and backslashes */
12390 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012391 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012392 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012393 continue;
12394 }
12395
12396 /* Map special whitespace to '\t', \n', '\r' */
12397 if (ch == '\t') {
12398 PyUnicode_WRITE(okind, odata, o++, '\\');
12399 PyUnicode_WRITE(okind, odata, o++, 't');
12400 }
12401 else if (ch == '\n') {
12402 PyUnicode_WRITE(okind, odata, o++, '\\');
12403 PyUnicode_WRITE(okind, odata, o++, 'n');
12404 }
12405 else if (ch == '\r') {
12406 PyUnicode_WRITE(okind, odata, o++, '\\');
12407 PyUnicode_WRITE(okind, odata, o++, 'r');
12408 }
12409
12410 /* Map non-printable US ASCII to '\xhh' */
12411 else if (ch < ' ' || ch == 0x7F) {
12412 PyUnicode_WRITE(okind, odata, o++, '\\');
12413 PyUnicode_WRITE(okind, odata, o++, 'x');
12414 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12415 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12416 }
12417
12418 /* Copy ASCII characters as-is */
12419 else if (ch < 0x7F) {
12420 PyUnicode_WRITE(okind, odata, o++, ch);
12421 }
12422
12423 /* Non-ASCII characters */
12424 else {
12425 /* Map Unicode whitespace and control characters
12426 (categories Z* and C* except ASCII space)
12427 */
12428 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12429 PyUnicode_WRITE(okind, odata, o++, '\\');
12430 /* Map 8-bit characters to '\xhh' */
12431 if (ch <= 0xff) {
12432 PyUnicode_WRITE(okind, odata, o++, 'x');
12433 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12434 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12435 }
12436 /* Map 16-bit characters to '\uxxxx' */
12437 else if (ch <= 0xffff) {
12438 PyUnicode_WRITE(okind, odata, o++, 'u');
12439 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12440 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12441 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12442 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12443 }
12444 /* Map 21-bit characters to '\U00xxxxxx' */
12445 else {
12446 PyUnicode_WRITE(okind, odata, o++, 'U');
12447 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12448 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12449 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12450 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12451 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12452 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12453 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12454 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12455 }
12456 }
12457 /* Copy characters as-is */
12458 else {
12459 PyUnicode_WRITE(okind, odata, o++, ch);
12460 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012461 }
12462 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012464 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012465 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012466 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012467}
12468
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012469PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012470 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012471\n\
12472Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012473such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012474arguments start and end are interpreted as in slice notation.\n\
12475\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012476Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012477
12478static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012479unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012480{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012481 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012482 Py_ssize_t start;
12483 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012484 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012485
Jesus Ceaac451502011-04-20 17:09:23 +020012486 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12487 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012488 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012489
Christian Heimesea71a522013-06-29 21:17:34 +020012490 if (PyUnicode_READY(self) == -1) {
12491 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012493 }
12494 if (PyUnicode_READY(substring) == -1) {
12495 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012496 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012497 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498
Victor Stinner7931d9a2011-11-04 00:22:48 +010012499 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012500
12501 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012502
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012503 if (result == -2)
12504 return NULL;
12505
Christian Heimes217cfd12007-12-02 14:31:20 +000012506 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507}
12508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012509PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012510 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012512Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513
12514static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012517 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012518 Py_ssize_t start;
12519 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012520 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521
Jesus Ceaac451502011-04-20 17:09:23 +020012522 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12523 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012524 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525
Christian Heimesea71a522013-06-29 21:17:34 +020012526 if (PyUnicode_READY(self) == -1) {
12527 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012528 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012529 }
12530 if (PyUnicode_READY(substring) == -1) {
12531 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012532 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012533 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012534
Victor Stinner7931d9a2011-11-04 00:22:48 +010012535 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012536
12537 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012539 if (result == -2)
12540 return NULL;
12541
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542 if (result < 0) {
12543 PyErr_SetString(PyExc_ValueError, "substring not found");
12544 return NULL;
12545 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012546
Christian Heimes217cfd12007-12-02 14:31:20 +000012547 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548}
12549
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012550PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012551 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012552\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012553Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012554done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012555
12556static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012557unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012558{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012559 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012560 Py_UCS4 fillchar = ' ';
12561
Victor Stinnere9a29352011-10-01 02:14:59 +020012562 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012563 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012564
Benjamin Petersonbac79492012-01-14 13:34:47 -050012565 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566 return NULL;
12567
Victor Stinnerc4b49542011-12-11 22:44:26 +010012568 if (PyUnicode_GET_LENGTH(self) >= width)
12569 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570
Victor Stinnerc4b49542011-12-11 22:44:26 +010012571 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572}
12573
Alexander Belopolsky40018472011-02-26 01:02:56 +000012574PyObject *
12575PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012576{
12577 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012578
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579 s = PyUnicode_FromObject(s);
12580 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012581 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012582 if (sep != NULL) {
12583 sep = PyUnicode_FromObject(sep);
12584 if (sep == NULL) {
12585 Py_DECREF(s);
12586 return NULL;
12587 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012588 }
12589
Victor Stinner9310abb2011-10-05 00:59:23 +020012590 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012591
12592 Py_DECREF(s);
12593 Py_XDECREF(sep);
12594 return result;
12595}
12596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012597PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012598 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012599\n\
12600Return a list of the words in S, using sep as the\n\
12601delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012602splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012603whitespace string is a separator and empty strings are\n\
12604removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012605
12606static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012607unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012608{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012609 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012611 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012612
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012613 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12614 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012615 return NULL;
12616
12617 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012618 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012619 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012620 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012621 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012622 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623}
12624
Thomas Wouters477c8d52006-05-27 19:21:47 +000012625PyObject *
12626PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12627{
12628 PyObject* str_obj;
12629 PyObject* sep_obj;
12630 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012631 int kind1, kind2, kind;
12632 void *buf1 = NULL, *buf2 = NULL;
12633 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012634
12635 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012636 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012637 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012638 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012639 if (!sep_obj) {
12640 Py_DECREF(str_obj);
12641 return NULL;
12642 }
12643 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12644 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012645 Py_DECREF(str_obj);
12646 return NULL;
12647 }
12648
Victor Stinner14f8f022011-10-05 20:58:25 +020012649 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012651 kind = Py_MAX(kind1, kind2);
12652 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012654 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655 if (!buf1)
12656 goto onError;
12657 buf2 = PyUnicode_DATA(sep_obj);
12658 if (kind2 != kind)
12659 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12660 if (!buf2)
12661 goto onError;
12662 len1 = PyUnicode_GET_LENGTH(str_obj);
12663 len2 = PyUnicode_GET_LENGTH(sep_obj);
12664
Benjamin Petersonead6b532011-12-20 17:23:42 -060012665 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012667 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12668 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12669 else
12670 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012671 break;
12672 case PyUnicode_2BYTE_KIND:
12673 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12674 break;
12675 case PyUnicode_4BYTE_KIND:
12676 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12677 break;
12678 default:
12679 assert(0);
12680 out = 0;
12681 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012682
12683 Py_DECREF(sep_obj);
12684 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012685 if (kind1 != kind)
12686 PyMem_Free(buf1);
12687 if (kind2 != kind)
12688 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012689
12690 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012691 onError:
12692 Py_DECREF(sep_obj);
12693 Py_DECREF(str_obj);
12694 if (kind1 != kind && buf1)
12695 PyMem_Free(buf1);
12696 if (kind2 != kind && buf2)
12697 PyMem_Free(buf2);
12698 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012699}
12700
12701
12702PyObject *
12703PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12704{
12705 PyObject* str_obj;
12706 PyObject* sep_obj;
12707 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012708 int kind1, kind2, kind;
12709 void *buf1 = NULL, *buf2 = NULL;
12710 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012711
12712 str_obj = PyUnicode_FromObject(str_in);
12713 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012714 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012715 sep_obj = PyUnicode_FromObject(sep_in);
12716 if (!sep_obj) {
12717 Py_DECREF(str_obj);
12718 return NULL;
12719 }
12720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012721 kind1 = PyUnicode_KIND(str_in);
12722 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012723 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724 buf1 = PyUnicode_DATA(str_in);
12725 if (kind1 != kind)
12726 buf1 = _PyUnicode_AsKind(str_in, kind);
12727 if (!buf1)
12728 goto onError;
12729 buf2 = PyUnicode_DATA(sep_obj);
12730 if (kind2 != kind)
12731 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12732 if (!buf2)
12733 goto onError;
12734 len1 = PyUnicode_GET_LENGTH(str_obj);
12735 len2 = PyUnicode_GET_LENGTH(sep_obj);
12736
Benjamin Petersonead6b532011-12-20 17:23:42 -060012737 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012738 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012739 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12740 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12741 else
12742 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012743 break;
12744 case PyUnicode_2BYTE_KIND:
12745 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12746 break;
12747 case PyUnicode_4BYTE_KIND:
12748 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12749 break;
12750 default:
12751 assert(0);
12752 out = 0;
12753 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012754
12755 Py_DECREF(sep_obj);
12756 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012757 if (kind1 != kind)
12758 PyMem_Free(buf1);
12759 if (kind2 != kind)
12760 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012761
12762 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012763 onError:
12764 Py_DECREF(sep_obj);
12765 Py_DECREF(str_obj);
12766 if (kind1 != kind && buf1)
12767 PyMem_Free(buf1);
12768 if (kind2 != kind && buf2)
12769 PyMem_Free(buf2);
12770 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012771}
12772
12773PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012774 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012775\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012776Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012777the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012778found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012779
12780static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012781unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012782{
Victor Stinner9310abb2011-10-05 00:59:23 +020012783 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012784}
12785
12786PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012787 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012788\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012789Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012790the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012791separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012792
12793static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012794unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012795{
Victor Stinner9310abb2011-10-05 00:59:23 +020012796 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012797}
12798
Alexander Belopolsky40018472011-02-26 01:02:56 +000012799PyObject *
12800PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012801{
12802 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012803
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012804 s = PyUnicode_FromObject(s);
12805 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012806 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012807 if (sep != NULL) {
12808 sep = PyUnicode_FromObject(sep);
12809 if (sep == NULL) {
12810 Py_DECREF(s);
12811 return NULL;
12812 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012813 }
12814
Victor Stinner9310abb2011-10-05 00:59:23 +020012815 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012816
12817 Py_DECREF(s);
12818 Py_XDECREF(sep);
12819 return result;
12820}
12821
12822PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012823 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012824\n\
12825Return a list of the words in S, using sep as the\n\
12826delimiter string, starting at the end of the string and\n\
12827working to the front. If maxsplit is given, at most maxsplit\n\
12828splits are done. If sep is not specified, any whitespace string\n\
12829is a separator.");
12830
12831static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012832unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012833{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012834 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012835 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012836 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012837
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012838 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12839 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012840 return NULL;
12841
12842 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012843 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012844 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012845 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012846 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012847 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012848}
12849
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012850PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012851 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012852\n\
12853Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012854Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012855is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012856
12857static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012858unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012859{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012860 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012861 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012862
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012863 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12864 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012865 return NULL;
12866
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012867 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868}
12869
12870static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012871PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012872{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012873 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874}
12875
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012876PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012877 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012878\n\
12879Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012880and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012881
12882static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012883unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012884{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012885 if (PyUnicode_READY(self) == -1)
12886 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012887 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012888}
12889
Larry Hastings31826802013-10-19 00:09:25 -070012890/*[clinic]
Georg Brandlceee0772007-11-27 23:48:05 +000012891
Larry Hastings31826802013-10-19 00:09:25 -070012892@staticmethod
12893str.maketrans as unicode_maketrans
12894
12895 x: object
12896
12897 y: unicode=NULL
12898
12899 z: unicode=NULL
12900
12901 /
12902
12903Return a translation table usable for str.translate().
12904
12905If there is only one argument, it must be a dictionary mapping Unicode
12906ordinals (integers) or characters to Unicode ordinals, strings or None.
12907Character keys will be then converted to ordinals.
12908If there are two arguments, they must be strings of equal length, and
12909in the resulting dictionary, each character in x will be mapped to the
12910character at the same position in y. If there is a third argument, it
12911must be a string, whose characters will be mapped to None in the result.
12912[clinic]*/
12913
12914PyDoc_STRVAR(unicode_maketrans__doc__,
Larry Hastings44e2eaa2013-11-23 15:37:55 -080012915"maketrans(x, y=None, z=None)\n"
Larry Hastings31826802013-10-19 00:09:25 -070012916"Return a translation table usable for str.translate().\n"
12917"\n"
Larry Hastings31826802013-10-19 00:09:25 -070012918"If there is only one argument, it must be a dictionary mapping Unicode\n"
12919"ordinals (integers) or characters to Unicode ordinals, strings or None.\n"
12920"Character keys will be then converted to ordinals.\n"
12921"If there are two arguments, they must be strings of equal length, and\n"
12922"in the resulting dictionary, each character in x will be mapped to the\n"
12923"character at the same position in y. If there is a third argument, it\n"
12924"must be a string, whose characters will be mapped to None in the result.");
12925
12926#define UNICODE_MAKETRANS_METHODDEF \
12927 {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__},
12928
12929static PyObject *
Larry Hastingsebdcb502013-11-23 14:54:00 -080012930unicode_maketrans_impl(void *null, PyObject *x, PyObject *y, PyObject *z);
Larry Hastings31826802013-10-19 00:09:25 -070012931
12932static PyObject *
Larry Hastingsebdcb502013-11-23 14:54:00 -080012933unicode_maketrans(void *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012934{
Larry Hastings31826802013-10-19 00:09:25 -070012935 PyObject *return_value = NULL;
12936 PyObject *x;
12937 PyObject *y = NULL;
12938 PyObject *z = NULL;
12939
12940 if (!PyArg_ParseTuple(args,
12941 "O|UU:maketrans",
12942 &x, &y, &z))
12943 goto exit;
Larry Hastingsebdcb502013-11-23 14:54:00 -080012944 return_value = unicode_maketrans_impl(null, x, y, z);
Larry Hastings31826802013-10-19 00:09:25 -070012945
12946exit:
12947 return return_value;
12948}
12949
12950static PyObject *
Larry Hastingsebdcb502013-11-23 14:54:00 -080012951unicode_maketrans_impl(void *null, PyObject *x, PyObject *y, PyObject *z)
Larry Hastings44e2eaa2013-11-23 15:37:55 -080012952/*[clinic checksum: 7f76f414a0dfd0c614e0d4717872eeb520516da7]*/
Larry Hastings31826802013-10-19 00:09:25 -070012953{
Georg Brandlceee0772007-11-27 23:48:05 +000012954 PyObject *new = NULL, *key, *value;
12955 Py_ssize_t i = 0;
12956 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012957
Georg Brandlceee0772007-11-27 23:48:05 +000012958 new = PyDict_New();
12959 if (!new)
12960 return NULL;
12961 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012962 int x_kind, y_kind, z_kind;
12963 void *x_data, *y_data, *z_data;
12964
Georg Brandlceee0772007-11-27 23:48:05 +000012965 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012966 if (!PyUnicode_Check(x)) {
12967 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12968 "be a string if there is a second argument");
12969 goto err;
12970 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012971 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012972 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12973 "arguments must have equal length");
12974 goto err;
12975 }
12976 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012977 x_kind = PyUnicode_KIND(x);
12978 y_kind = PyUnicode_KIND(y);
12979 x_data = PyUnicode_DATA(x);
12980 y_data = PyUnicode_DATA(y);
12981 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12982 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012983 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012984 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012985 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012986 if (!value) {
12987 Py_DECREF(key);
12988 goto err;
12989 }
Georg Brandlceee0772007-11-27 23:48:05 +000012990 res = PyDict_SetItem(new, key, value);
12991 Py_DECREF(key);
12992 Py_DECREF(value);
12993 if (res < 0)
12994 goto err;
12995 }
12996 /* create entries for deleting chars in z */
12997 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998 z_kind = PyUnicode_KIND(z);
12999 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013000 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013001 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013002 if (!key)
13003 goto err;
13004 res = PyDict_SetItem(new, key, Py_None);
13005 Py_DECREF(key);
13006 if (res < 0)
13007 goto err;
13008 }
13009 }
13010 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013011 int kind;
13012 void *data;
13013
Georg Brandlceee0772007-11-27 23:48:05 +000013014 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013015 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013016 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13017 "to maketrans it must be a dict");
13018 goto err;
13019 }
13020 /* copy entries into the new dict, converting string keys to int keys */
13021 while (PyDict_Next(x, &i, &key, &value)) {
13022 if (PyUnicode_Check(key)) {
13023 /* convert string keys to integer keys */
13024 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013025 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013026 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13027 "table must be of length 1");
13028 goto err;
13029 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013030 kind = PyUnicode_KIND(key);
13031 data = PyUnicode_DATA(key);
13032 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013033 if (!newkey)
13034 goto err;
13035 res = PyDict_SetItem(new, newkey, value);
13036 Py_DECREF(newkey);
13037 if (res < 0)
13038 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013039 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013040 /* just keep integer keys */
13041 if (PyDict_SetItem(new, key, value) < 0)
13042 goto err;
13043 } else {
13044 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13045 "be strings or integers");
13046 goto err;
13047 }
13048 }
13049 }
13050 return new;
13051 err:
13052 Py_DECREF(new);
13053 return NULL;
13054}
13055
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013056PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013057 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013058\n\
13059Return a copy of the string S, where all characters have been mapped\n\
13060through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000013061Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000013062Unmapped characters are left untouched. Characters mapped to None\n\
13063are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013064
13065static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013066unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013067{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013068 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013069}
13070
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013071PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013072 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013073\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013074Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013075
13076static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013077unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013078{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013079 if (PyUnicode_READY(self) == -1)
13080 return NULL;
13081 if (PyUnicode_IS_ASCII(self))
13082 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013083 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013084}
13085
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013086PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013087 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013088\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013089Pad a numeric string S with zeros on the left, to fill a field\n\
13090of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013091
13092static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013093unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013094{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013095 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013096 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013097 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013098 int kind;
13099 void *data;
13100 Py_UCS4 chr;
13101
Martin v. Löwis18e16552006-02-15 17:27:45 +000013102 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103 return NULL;
13104
Benjamin Petersonbac79492012-01-14 13:34:47 -050013105 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013106 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107
Victor Stinnerc4b49542011-12-11 22:44:26 +010013108 if (PyUnicode_GET_LENGTH(self) >= width)
13109 return unicode_result_unchanged(self);
13110
13111 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013112
13113 u = pad(self, fill, 0, '0');
13114
Walter Dörwald068325e2002-04-15 13:36:47 +000013115 if (u == NULL)
13116 return NULL;
13117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013118 kind = PyUnicode_KIND(u);
13119 data = PyUnicode_DATA(u);
13120 chr = PyUnicode_READ(kind, data, fill);
13121
13122 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013123 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013124 PyUnicode_WRITE(kind, data, 0, chr);
13125 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126 }
13127
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013128 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013129 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013131
#if 0
/* Compiled out.  Thin method wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013140PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013141 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013142\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013143Return True if S starts with the specified prefix, False otherwise.\n\
13144With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013145With optional end, stop comparing S at that position.\n\
13146prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013147
13148static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013149unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013150 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013151{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013152 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013153 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013154 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013155 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013156 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013157
Jesus Ceaac451502011-04-20 17:09:23 +020013158 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013159 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013160 if (PyTuple_Check(subobj)) {
13161 Py_ssize_t i;
13162 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013163 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013164 if (substring == NULL)
13165 return NULL;
13166 result = tailmatch(self, substring, start, end, -1);
13167 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013168 if (result == -1)
13169 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013170 if (result) {
13171 Py_RETURN_TRUE;
13172 }
13173 }
13174 /* nothing matched */
13175 Py_RETURN_FALSE;
13176 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013177 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013178 if (substring == NULL) {
13179 if (PyErr_ExceptionMatches(PyExc_TypeError))
13180 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13181 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013182 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013183 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013184 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013185 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013186 if (result == -1)
13187 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013188 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013189}
13190
13191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013192PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013193 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013194\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013195Return True if S ends with the specified suffix, False otherwise.\n\
13196With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013197With optional end, stop comparing S at that position.\n\
13198suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013199
13200static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013201unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013202 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013203{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013204 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013205 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013206 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013207 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013208 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013209
Jesus Ceaac451502011-04-20 17:09:23 +020013210 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013211 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013212 if (PyTuple_Check(subobj)) {
13213 Py_ssize_t i;
13214 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013215 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013216 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013217 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013218 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013219 result = tailmatch(self, substring, start, end, +1);
13220 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013221 if (result == -1)
13222 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013223 if (result) {
13224 Py_RETURN_TRUE;
13225 }
13226 }
13227 Py_RETURN_FALSE;
13228 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013229 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013230 if (substring == NULL) {
13231 if (PyErr_ExceptionMatches(PyExc_TypeError))
13232 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13233 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013234 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013235 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013236 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013237 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013238 if (result == -1)
13239 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013240 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013241}
13242
Victor Stinner202fdca2012-05-07 12:47:02 +020013243Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013244_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013245{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013246 if (!writer->readonly)
13247 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13248 else {
13249 /* Copy-on-write mode: set buffer size to 0 so
13250 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13251 * next write. */
13252 writer->size = 0;
13253 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013254 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13255 writer->data = PyUnicode_DATA(writer->buffer);
13256 writer->kind = PyUnicode_KIND(writer->buffer);
13257}
13258
Victor Stinnerd3f08822012-05-29 12:57:52 +020013259void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013260_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013261{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013262 memset(writer, 0, sizeof(*writer));
13263#ifdef Py_DEBUG
13264 writer->kind = 5; /* invalid kind */
13265#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013266 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013267}
13268
Victor Stinnerd3f08822012-05-29 12:57:52 +020013269int
13270_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13271 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013272{
Victor Stinner6989ba02013-11-18 21:08:39 +010013273#ifdef MS_WINDOWS
13274 /* On Windows, overallocate by 50% is the best factor */
13275# define OVERALLOCATE_FACTOR 2
13276#else
13277 /* On Linux, overallocate by 25% is the best factor */
13278# define OVERALLOCATE_FACTOR 4
13279#endif
Victor Stinner202fdca2012-05-07 12:47:02 +020013280 Py_ssize_t newlen;
13281 PyObject *newbuffer;
13282
Victor Stinnerd3f08822012-05-29 12:57:52 +020013283 assert(length > 0);
13284
Victor Stinner202fdca2012-05-07 12:47:02 +020013285 if (length > PY_SSIZE_T_MAX - writer->pos) {
13286 PyErr_NoMemory();
13287 return -1;
13288 }
13289 newlen = writer->pos + length;
13290
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013291 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013292
Victor Stinnerd3f08822012-05-29 12:57:52 +020013293 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013294 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013295 if (writer->overallocate
13296 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13297 /* overallocate to limit the number of realloc() */
13298 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013299 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013300 if (newlen < writer->min_length)
13301 newlen = writer->min_length;
13302
Victor Stinnerd3f08822012-05-29 12:57:52 +020013303 writer->buffer = PyUnicode_New(newlen, maxchar);
13304 if (writer->buffer == NULL)
13305 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013306 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013307 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013308 if (writer->overallocate
13309 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13310 /* overallocate to limit the number of realloc() */
13311 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013312 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013313 if (newlen < writer->min_length)
13314 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013315
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013316 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013317 /* resize + widen */
13318 newbuffer = PyUnicode_New(newlen, maxchar);
13319 if (newbuffer == NULL)
13320 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013321 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13322 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013323 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013324 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013325 }
13326 else {
13327 newbuffer = resize_compact(writer->buffer, newlen);
13328 if (newbuffer == NULL)
13329 return -1;
13330 }
13331 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013332 }
13333 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013334 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013335 newbuffer = PyUnicode_New(writer->size, maxchar);
13336 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013337 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013338 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13339 writer->buffer, 0, writer->pos);
13340 Py_DECREF(writer->buffer);
13341 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013342 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013343 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013344 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013345
13346#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013347}
13348
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013349Py_LOCAL_INLINE(int)
13350_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013351{
13352 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13353 return -1;
13354 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13355 writer->pos++;
13356 return 0;
13357}
13358
13359int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013360_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13361{
13362 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13363}
13364
13365int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013366_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13367{
13368 Py_UCS4 maxchar;
13369 Py_ssize_t len;
13370
13371 if (PyUnicode_READY(str) == -1)
13372 return -1;
13373 len = PyUnicode_GET_LENGTH(str);
13374 if (len == 0)
13375 return 0;
13376 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13377 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013378 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013379 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013380 Py_INCREF(str);
13381 writer->buffer = str;
13382 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013383 writer->pos += len;
13384 return 0;
13385 }
13386 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13387 return -1;
13388 }
13389 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13390 str, 0, len);
13391 writer->pos += len;
13392 return 0;
13393}
13394
Victor Stinnere215d962012-10-06 23:03:36 +020013395int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013396_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13397 Py_ssize_t start, Py_ssize_t end)
13398{
13399 Py_UCS4 maxchar;
13400 Py_ssize_t len;
13401
13402 if (PyUnicode_READY(str) == -1)
13403 return -1;
13404
13405 assert(0 <= start);
13406 assert(end <= PyUnicode_GET_LENGTH(str));
13407 assert(start <= end);
13408
13409 if (end == 0)
13410 return 0;
13411
13412 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13413 return _PyUnicodeWriter_WriteStr(writer, str);
13414
13415 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13416 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13417 else
13418 maxchar = writer->maxchar;
13419 len = end - start;
13420
13421 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13422 return -1;
13423
13424 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13425 str, start, len);
13426 writer->pos += len;
13427 return 0;
13428}
13429
13430int
Victor Stinner4a587072013-11-19 12:54:53 +010013431_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13432 const char *ascii, Py_ssize_t len)
13433{
13434 if (len == -1)
13435 len = strlen(ascii);
13436
13437 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13438
13439 if (writer->buffer == NULL && !writer->overallocate) {
13440 PyObject *str;
13441
13442 str = _PyUnicode_FromASCII(ascii, len);
13443 if (str == NULL)
13444 return -1;
13445
13446 writer->readonly = 1;
13447 writer->buffer = str;
13448 _PyUnicodeWriter_Update(writer);
13449 writer->pos += len;
13450 return 0;
13451 }
13452
13453 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13454 return -1;
13455
13456 switch (writer->kind)
13457 {
13458 case PyUnicode_1BYTE_KIND:
13459 {
13460 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13461 Py_UCS1 *data = writer->data;
13462
13463 Py_MEMCPY(data + writer->pos, str, len);
13464 break;
13465 }
13466 case PyUnicode_2BYTE_KIND:
13467 {
13468 _PyUnicode_CONVERT_BYTES(
13469 Py_UCS1, Py_UCS2,
13470 ascii, ascii + len,
13471 (Py_UCS2 *)writer->data + writer->pos);
13472 break;
13473 }
13474 case PyUnicode_4BYTE_KIND:
13475 {
13476 _PyUnicode_CONVERT_BYTES(
13477 Py_UCS1, Py_UCS4,
13478 ascii, ascii + len,
13479 (Py_UCS4 *)writer->data + writer->pos);
13480 break;
13481 }
13482 default:
13483 assert(0);
13484 }
13485
13486 writer->pos += len;
13487 return 0;
13488}
13489
13490int
13491_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13492 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013493{
13494 Py_UCS4 maxchar;
13495
13496 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13497 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13498 return -1;
13499 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13500 writer->pos += len;
13501 return 0;
13502}
13503
Victor Stinnerd3f08822012-05-29 12:57:52 +020013504PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013505_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013506{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013507 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013508 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013509 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013510 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013511 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013512 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013513 str = writer->buffer;
13514 writer->buffer = NULL;
13515 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13516 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013517 }
13518 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13519 PyObject *newbuffer;
13520 newbuffer = resize_compact(writer->buffer, writer->pos);
13521 if (newbuffer == NULL) {
13522 Py_DECREF(writer->buffer);
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013523 writer->buffer = NULL;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013524 return NULL;
13525 }
13526 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013527 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013528 str = writer->buffer;
13529 writer->buffer = NULL;
13530 assert(_PyUnicode_CheckConsistency(str, 1));
13531 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013532}
13533
Victor Stinnerd3f08822012-05-29 12:57:52 +020013534void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013535_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013536{
13537 Py_CLEAR(writer->buffer);
13538}
13539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013540#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013541
13542PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013543 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013544\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013545Return a formatted version of S, using substitutions from args and kwargs.\n\
13546The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013547
Eric Smith27bbca62010-11-04 17:06:58 +000013548PyDoc_STRVAR(format_map__doc__,
13549 "S.format_map(mapping) -> str\n\
13550\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013551Return a formatted version of S, using substitutions from mapping.\n\
13552The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013553
Eric Smith4a7d76d2008-05-30 18:10:19 +000013554static PyObject *
13555unicode__format__(PyObject* self, PyObject* args)
13556{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013557 PyObject *format_spec;
13558 _PyUnicodeWriter writer;
13559 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013560
13561 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13562 return NULL;
13563
Victor Stinnerd3f08822012-05-29 12:57:52 +020013564 if (PyUnicode_READY(self) == -1)
13565 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013566 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013567 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13568 self, format_spec, 0,
13569 PyUnicode_GET_LENGTH(format_spec));
13570 if (ret == -1) {
13571 _PyUnicodeWriter_Dealloc(&writer);
13572 return NULL;
13573 }
13574 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013575}
13576
Eric Smith8c663262007-08-25 02:26:07 +000013577PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013578 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013579\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013580Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013581
13582static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013583unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013584{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013585 Py_ssize_t size;
13586
13587 /* If it's a compact object, account for base structure +
13588 character data. */
13589 if (PyUnicode_IS_COMPACT_ASCII(v))
13590 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13591 else if (PyUnicode_IS_COMPACT(v))
13592 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013593 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013594 else {
13595 /* If it is a two-block object, account for base object, and
13596 for character block if present. */
13597 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013598 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013599 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013600 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013601 }
13602 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013603 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013604 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013605 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013606 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013607 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013608
13609 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013610}
13611
13612PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013613 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013614
13615static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013616unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013617{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013618 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013619 if (!copy)
13620 return NULL;
13621 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013622}
13623
Guido van Rossumd57fd912000-03-10 22:53:23 +000013624static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013625 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013626 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013627 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13628 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013629 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13630 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013631 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013632 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13633 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13634 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013635 {"expandtabs", (PyCFunction) unicode_expandtabs,
13636 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013637 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013638 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013639 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13640 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13641 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013642 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013643 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13644 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13645 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013646 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013647 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013648 {"splitlines", (PyCFunction) unicode_splitlines,
13649 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013650 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013651 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13652 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13653 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13654 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13655 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13656 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13657 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13658 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13659 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13660 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13661 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13662 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13663 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13664 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013665 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013666 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013667 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013668 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013669 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013670 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013671 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013672 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013673#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013674 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013675 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013676#endif
13677
Benjamin Peterson14339b62009-01-31 16:36:08 +000013678 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013679 {NULL, NULL}
13680};
13681
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013682static PyObject *
13683unicode_mod(PyObject *v, PyObject *w)
13684{
Brian Curtindfc80e32011-08-10 20:28:54 -050013685 if (!PyUnicode_Check(v))
13686 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013687 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013688}
13689
13690static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013691 0, /*nb_add*/
13692 0, /*nb_subtract*/
13693 0, /*nb_multiply*/
13694 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013695};
13696
Guido van Rossumd57fd912000-03-10 22:53:23 +000013697static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013698 (lenfunc) unicode_length, /* sq_length */
13699 PyUnicode_Concat, /* sq_concat */
13700 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13701 (ssizeargfunc) unicode_getitem, /* sq_item */
13702 0, /* sq_slice */
13703 0, /* sq_ass_item */
13704 0, /* sq_ass_slice */
13705 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013706};
13707
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013708static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013709unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013710{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013711 if (PyUnicode_READY(self) == -1)
13712 return NULL;
13713
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013714 if (PyIndex_Check(item)) {
13715 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013716 if (i == -1 && PyErr_Occurred())
13717 return NULL;
13718 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013719 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013720 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013721 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013722 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013723 PyObject *result;
13724 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013725 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013726 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013728 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013729 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013730 return NULL;
13731 }
13732
13733 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013734 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013735 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013736 slicelength == PyUnicode_GET_LENGTH(self)) {
13737 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013738 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013739 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013740 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013741 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013742 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013743 src_kind = PyUnicode_KIND(self);
13744 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013745 if (!PyUnicode_IS_ASCII(self)) {
13746 kind_limit = kind_maxchar_limit(src_kind);
13747 max_char = 0;
13748 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13749 ch = PyUnicode_READ(src_kind, src_data, cur);
13750 if (ch > max_char) {
13751 max_char = ch;
13752 if (max_char >= kind_limit)
13753 break;
13754 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013755 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013756 }
Victor Stinner55c99112011-10-13 01:17:06 +020013757 else
13758 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013759 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013760 if (result == NULL)
13761 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013762 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013763 dest_data = PyUnicode_DATA(result);
13764
13765 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013766 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13767 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013768 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013769 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013770 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013771 } else {
13772 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13773 return NULL;
13774 }
13775}
13776
13777static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013778 (lenfunc)unicode_length, /* mp_length */
13779 (binaryfunc)unicode_subscript, /* mp_subscript */
13780 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013781};
13782
Guido van Rossumd57fd912000-03-10 22:53:23 +000013783
Guido van Rossumd57fd912000-03-10 22:53:23 +000013784/* Helpers for PyUnicode_Format() */
13785
Victor Stinnera47082312012-10-04 02:19:54 +020013786struct unicode_formatter_t {
13787 PyObject *args;
13788 int args_owned;
13789 Py_ssize_t arglen, argidx;
13790 PyObject *dict;
13791
13792 enum PyUnicode_Kind fmtkind;
13793 Py_ssize_t fmtcnt, fmtpos;
13794 void *fmtdata;
13795 PyObject *fmtstr;
13796
13797 _PyUnicodeWriter writer;
13798};
13799
13800struct unicode_format_arg_t {
13801 Py_UCS4 ch;
13802 int flags;
13803 Py_ssize_t width;
13804 int prec;
13805 int sign;
13806};
13807
Guido van Rossumd57fd912000-03-10 22:53:23 +000013808static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013809unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013810{
Victor Stinnera47082312012-10-04 02:19:54 +020013811 Py_ssize_t argidx = ctx->argidx;
13812
13813 if (argidx < ctx->arglen) {
13814 ctx->argidx++;
13815 if (ctx->arglen < 0)
13816 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013817 else
Victor Stinnera47082312012-10-04 02:19:54 +020013818 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013819 }
13820 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013821 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013822 return NULL;
13823}
13824
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013825/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013826
Victor Stinnera47082312012-10-04 02:19:54 +020013827/* Format a float into the writer if the writer is not NULL, or into *p_output
13828 otherwise.
13829
13830 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013831static int
Victor Stinnera47082312012-10-04 02:19:54 +020013832formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13833 PyObject **p_output,
13834 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013835{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013836 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013837 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013838 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013839 int prec;
13840 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013841
Guido van Rossumd57fd912000-03-10 22:53:23 +000013842 x = PyFloat_AsDouble(v);
13843 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013844 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013845
Victor Stinnera47082312012-10-04 02:19:54 +020013846 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013847 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013848 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013849
Victor Stinnera47082312012-10-04 02:19:54 +020013850 if (arg->flags & F_ALT)
13851 dtoa_flags = Py_DTSF_ALT;
13852 else
13853 dtoa_flags = 0;
13854 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013855 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013856 return -1;
13857 len = strlen(p);
13858 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013859 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013860 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013861 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013862 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013863 }
13864 else
13865 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013866 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013867 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013868}
13869
Victor Stinnerd0880d52012-04-27 23:40:13 +020013870/* formatlong() emulates the format codes d, u, o, x and X, and
13871 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13872 * Python's regular ints.
13873 * Return value: a new PyUnicodeObject*, or NULL if error.
13874 * The output string is of the form
13875 * "-"? ("0x" | "0X")? digit+
13876 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13877 * set in flags. The case of hex digits will be correct,
13878 * There will be at least prec digits, zero-filled on the left if
13879 * necessary to get that many.
13880 * val object to be converted
13881 * flags bitmask of format flags; only F_ALT is looked at
13882 * prec minimum number of digits; 0-fill on left if needed
13883 * type a character in [duoxX]; u acts the same as d
13884 *
13885 * CAUTION: o, x and X conversions on regular ints can never
13886 * produce a '-' sign, but can for Python's unbounded ints.
13887 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013888static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013889formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013890{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013891 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013892 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013893 Py_ssize_t i;
13894 int sign; /* 1 if '-', else 0 */
13895 int len; /* number of characters */
13896 Py_ssize_t llen;
13897 int numdigits; /* len == numnondigits + numdigits */
13898 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013899 int prec = arg->prec;
13900 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013901
Victor Stinnerd0880d52012-04-27 23:40:13 +020013902 /* Avoid exceeding SSIZE_T_MAX */
13903 if (prec > INT_MAX-3) {
13904 PyErr_SetString(PyExc_OverflowError,
13905 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013906 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013907 }
13908
13909 assert(PyLong_Check(val));
13910
13911 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013912 default:
13913 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013914 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013915 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013916 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013917 /* int and int subclasses should print numerically when a numeric */
13918 /* format code is used (see issue18780) */
13919 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013920 break;
13921 case 'o':
13922 numnondigits = 2;
13923 result = PyNumber_ToBase(val, 8);
13924 break;
13925 case 'x':
13926 case 'X':
13927 numnondigits = 2;
13928 result = PyNumber_ToBase(val, 16);
13929 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013930 }
13931 if (!result)
13932 return NULL;
13933
13934 assert(unicode_modifiable(result));
13935 assert(PyUnicode_IS_READY(result));
13936 assert(PyUnicode_IS_ASCII(result));
13937
13938 /* To modify the string in-place, there can only be one reference. */
13939 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013940 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013941 PyErr_BadInternalCall();
13942 return NULL;
13943 }
13944 buf = PyUnicode_DATA(result);
13945 llen = PyUnicode_GET_LENGTH(result);
13946 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013947 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013948 PyErr_SetString(PyExc_ValueError,
13949 "string too large in _PyBytes_FormatLong");
13950 return NULL;
13951 }
13952 len = (int)llen;
13953 sign = buf[0] == '-';
13954 numnondigits += sign;
13955 numdigits = len - numnondigits;
13956 assert(numdigits > 0);
13957
13958 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013959 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013960 (type == 'o' || type == 'x' || type == 'X'))) {
13961 assert(buf[sign] == '0');
13962 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13963 buf[sign+1] == 'o');
13964 numnondigits -= 2;
13965 buf += 2;
13966 len -= 2;
13967 if (sign)
13968 buf[0] = '-';
13969 assert(len == numnondigits + numdigits);
13970 assert(numdigits > 0);
13971 }
13972
13973 /* Fill with leading zeroes to meet minimum width. */
13974 if (prec > numdigits) {
13975 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13976 numnondigits + prec);
13977 char *b1;
13978 if (!r1) {
13979 Py_DECREF(result);
13980 return NULL;
13981 }
13982 b1 = PyBytes_AS_STRING(r1);
13983 for (i = 0; i < numnondigits; ++i)
13984 *b1++ = *buf++;
13985 for (i = 0; i < prec - numdigits; i++)
13986 *b1++ = '0';
13987 for (i = 0; i < numdigits; i++)
13988 *b1++ = *buf++;
13989 *b1 = '\0';
13990 Py_DECREF(result);
13991 result = r1;
13992 buf = PyBytes_AS_STRING(result);
13993 len = numnondigits + prec;
13994 }
13995
13996 /* Fix up case for hex conversions. */
13997 if (type == 'X') {
13998 /* Need to convert all lower case letters to upper case.
13999 and need to convert 0x to 0X (and -0x to -0X). */
14000 for (i = 0; i < len; i++)
14001 if (buf[i] >= 'a' && buf[i] <= 'x')
14002 buf[i] -= 'a'-'A';
14003 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014004 if (!PyUnicode_Check(result)
14005 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014006 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014007 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014008 Py_DECREF(result);
14009 result = unicode;
14010 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014011 else if (len != PyUnicode_GET_LENGTH(result)) {
14012 if (PyUnicode_Resize(&result, len) < 0)
14013 Py_CLEAR(result);
14014 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014015 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014016}
14017
Victor Stinner621ef3d2012-10-02 00:33:47 +020014018/* Format an integer.
14019 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014020 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014021 * -1 and raise an exception on error */
14022static int
Victor Stinnera47082312012-10-04 02:19:54 +020014023mainformatlong(PyObject *v,
14024 struct unicode_format_arg_t *arg,
14025 PyObject **p_output,
14026 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014027{
14028 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014029 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014030
14031 if (!PyNumber_Check(v))
14032 goto wrongtype;
14033
14034 if (!PyLong_Check(v)) {
14035 iobj = PyNumber_Long(v);
14036 if (iobj == NULL) {
14037 if (PyErr_ExceptionMatches(PyExc_TypeError))
14038 goto wrongtype;
14039 return -1;
14040 }
14041 assert(PyLong_Check(iobj));
14042 }
14043 else {
14044 iobj = v;
14045 Py_INCREF(iobj);
14046 }
14047
14048 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014049 && arg->width == -1 && arg->prec == -1
14050 && !(arg->flags & (F_SIGN | F_BLANK))
14051 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014052 {
14053 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014054 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014055 int base;
14056
Victor Stinnera47082312012-10-04 02:19:54 +020014057 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014058 {
14059 default:
14060 assert(0 && "'type' not in [diuoxX]");
14061 case 'd':
14062 case 'i':
14063 case 'u':
14064 base = 10;
14065 break;
14066 case 'o':
14067 base = 8;
14068 break;
14069 case 'x':
14070 case 'X':
14071 base = 16;
14072 break;
14073 }
14074
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014075 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14076 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014077 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014078 }
14079 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014080 return 1;
14081 }
14082
Victor Stinnera47082312012-10-04 02:19:54 +020014083 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014084 Py_DECREF(iobj);
14085 if (res == NULL)
14086 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014087 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014088 return 0;
14089
14090wrongtype:
14091 PyErr_Format(PyExc_TypeError,
14092 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020014093 "not %.200s",
14094 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014095 return -1;
14096}
14097
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014098static Py_UCS4
14099formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014100{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014101 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014102 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014103 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014104 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014105 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014106 goto onError;
14107 }
14108 else {
14109 /* Integer input truncated to a character */
14110 long x;
14111 x = PyLong_AsLong(v);
14112 if (x == -1 && PyErr_Occurred())
14113 goto onError;
14114
Victor Stinner8faf8212011-12-08 22:14:11 +010014115 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014116 PyErr_SetString(PyExc_OverflowError,
14117 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014118 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014119 }
14120
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014121 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014122 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014123
Benjamin Peterson29060642009-01-31 22:14:21 +000014124 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014125 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014126 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014127 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014128}
14129
Victor Stinnera47082312012-10-04 02:19:54 +020014130/* Parse options of an argument: flags, width, precision.
14131 Handle also "%(name)" syntax.
14132
14133 Return 0 if the argument has been formatted into arg->str.
14134 Return 1 if the argument has been written into ctx->writer,
14135 Raise an exception and return -1 on error. */
14136static int
14137unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14138 struct unicode_format_arg_t *arg)
14139{
14140#define FORMAT_READ(ctx) \
14141 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14142
14143 PyObject *v;
14144
Victor Stinnera47082312012-10-04 02:19:54 +020014145 if (arg->ch == '(') {
14146 /* Get argument value from a dictionary. Example: "%(name)s". */
14147 Py_ssize_t keystart;
14148 Py_ssize_t keylen;
14149 PyObject *key;
14150 int pcount = 1;
14151
14152 if (ctx->dict == NULL) {
14153 PyErr_SetString(PyExc_TypeError,
14154 "format requires a mapping");
14155 return -1;
14156 }
14157 ++ctx->fmtpos;
14158 --ctx->fmtcnt;
14159 keystart = ctx->fmtpos;
14160 /* Skip over balanced parentheses */
14161 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14162 arg->ch = FORMAT_READ(ctx);
14163 if (arg->ch == ')')
14164 --pcount;
14165 else if (arg->ch == '(')
14166 ++pcount;
14167 ctx->fmtpos++;
14168 }
14169 keylen = ctx->fmtpos - keystart - 1;
14170 if (ctx->fmtcnt < 0 || pcount > 0) {
14171 PyErr_SetString(PyExc_ValueError,
14172 "incomplete format key");
14173 return -1;
14174 }
14175 key = PyUnicode_Substring(ctx->fmtstr,
14176 keystart, keystart + keylen);
14177 if (key == NULL)
14178 return -1;
14179 if (ctx->args_owned) {
14180 Py_DECREF(ctx->args);
14181 ctx->args_owned = 0;
14182 }
14183 ctx->args = PyObject_GetItem(ctx->dict, key);
14184 Py_DECREF(key);
14185 if (ctx->args == NULL)
14186 return -1;
14187 ctx->args_owned = 1;
14188 ctx->arglen = -1;
14189 ctx->argidx = -2;
14190 }
14191
14192 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014193 while (--ctx->fmtcnt >= 0) {
14194 arg->ch = FORMAT_READ(ctx);
14195 ctx->fmtpos++;
14196 switch (arg->ch) {
14197 case '-': arg->flags |= F_LJUST; continue;
14198 case '+': arg->flags |= F_SIGN; continue;
14199 case ' ': arg->flags |= F_BLANK; continue;
14200 case '#': arg->flags |= F_ALT; continue;
14201 case '0': arg->flags |= F_ZERO; continue;
14202 }
14203 break;
14204 }
14205
14206 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014207 if (arg->ch == '*') {
14208 v = unicode_format_getnextarg(ctx);
14209 if (v == NULL)
14210 return -1;
14211 if (!PyLong_Check(v)) {
14212 PyErr_SetString(PyExc_TypeError,
14213 "* wants int");
14214 return -1;
14215 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014216 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014217 if (arg->width == -1 && PyErr_Occurred())
14218 return -1;
14219 if (arg->width < 0) {
14220 arg->flags |= F_LJUST;
14221 arg->width = -arg->width;
14222 }
14223 if (--ctx->fmtcnt >= 0) {
14224 arg->ch = FORMAT_READ(ctx);
14225 ctx->fmtpos++;
14226 }
14227 }
14228 else if (arg->ch >= '0' && arg->ch <= '9') {
14229 arg->width = arg->ch - '0';
14230 while (--ctx->fmtcnt >= 0) {
14231 arg->ch = FORMAT_READ(ctx);
14232 ctx->fmtpos++;
14233 if (arg->ch < '0' || arg->ch > '9')
14234 break;
14235 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14236 mixing signed and unsigned comparison. Since arg->ch is between
14237 '0' and '9', casting to int is safe. */
14238 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14239 PyErr_SetString(PyExc_ValueError,
14240 "width too big");
14241 return -1;
14242 }
14243 arg->width = arg->width*10 + (arg->ch - '0');
14244 }
14245 }
14246
14247 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014248 if (arg->ch == '.') {
14249 arg->prec = 0;
14250 if (--ctx->fmtcnt >= 0) {
14251 arg->ch = FORMAT_READ(ctx);
14252 ctx->fmtpos++;
14253 }
14254 if (arg->ch == '*') {
14255 v = unicode_format_getnextarg(ctx);
14256 if (v == NULL)
14257 return -1;
14258 if (!PyLong_Check(v)) {
14259 PyErr_SetString(PyExc_TypeError,
14260 "* wants int");
14261 return -1;
14262 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014263 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014264 if (arg->prec == -1 && PyErr_Occurred())
14265 return -1;
14266 if (arg->prec < 0)
14267 arg->prec = 0;
14268 if (--ctx->fmtcnt >= 0) {
14269 arg->ch = FORMAT_READ(ctx);
14270 ctx->fmtpos++;
14271 }
14272 }
14273 else if (arg->ch >= '0' && arg->ch <= '9') {
14274 arg->prec = arg->ch - '0';
14275 while (--ctx->fmtcnt >= 0) {
14276 arg->ch = FORMAT_READ(ctx);
14277 ctx->fmtpos++;
14278 if (arg->ch < '0' || arg->ch > '9')
14279 break;
14280 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14281 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014282 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014283 return -1;
14284 }
14285 arg->prec = arg->prec*10 + (arg->ch - '0');
14286 }
14287 }
14288 }
14289
14290 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14291 if (ctx->fmtcnt >= 0) {
14292 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14293 if (--ctx->fmtcnt >= 0) {
14294 arg->ch = FORMAT_READ(ctx);
14295 ctx->fmtpos++;
14296 }
14297 }
14298 }
14299 if (ctx->fmtcnt < 0) {
14300 PyErr_SetString(PyExc_ValueError,
14301 "incomplete format");
14302 return -1;
14303 }
14304 return 0;
14305
14306#undef FORMAT_READ
14307}
14308
14309/* Format one argument. Supported conversion specifiers:
14310
14311 - "s", "r", "a": any type
14312 - "i", "d", "u", "o", "x", "X": int
14313 - "e", "E", "f", "F", "g", "G": float
14314 - "c": int or str (1 character)
14315
Victor Stinner8dbd4212012-12-04 09:30:24 +010014316 When possible, the output is written directly into the Unicode writer
14317 (ctx->writer). A string is created when padding is required.
14318
Victor Stinnera47082312012-10-04 02:19:54 +020014319 Return 0 if the argument has been formatted into *p_str,
14320 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014321 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014322static int
14323unicode_format_arg_format(struct unicode_formatter_t *ctx,
14324 struct unicode_format_arg_t *arg,
14325 PyObject **p_str)
14326{
14327 PyObject *v;
14328 _PyUnicodeWriter *writer = &ctx->writer;
14329
14330 if (ctx->fmtcnt == 0)
14331 ctx->writer.overallocate = 0;
14332
14333 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014334 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014335 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014336 return 1;
14337 }
14338
14339 v = unicode_format_getnextarg(ctx);
14340 if (v == NULL)
14341 return -1;
14342
Victor Stinnera47082312012-10-04 02:19:54 +020014343
14344 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014345 case 's':
14346 case 'r':
14347 case 'a':
14348 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14349 /* Fast path */
14350 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14351 return -1;
14352 return 1;
14353 }
14354
14355 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14356 *p_str = v;
14357 Py_INCREF(*p_str);
14358 }
14359 else {
14360 if (arg->ch == 's')
14361 *p_str = PyObject_Str(v);
14362 else if (arg->ch == 'r')
14363 *p_str = PyObject_Repr(v);
14364 else
14365 *p_str = PyObject_ASCII(v);
14366 }
14367 break;
14368
14369 case 'i':
14370 case 'd':
14371 case 'u':
14372 case 'o':
14373 case 'x':
14374 case 'X':
14375 {
14376 int ret = mainformatlong(v, arg, p_str, writer);
14377 if (ret != 0)
14378 return ret;
14379 arg->sign = 1;
14380 break;
14381 }
14382
14383 case 'e':
14384 case 'E':
14385 case 'f':
14386 case 'F':
14387 case 'g':
14388 case 'G':
14389 if (arg->width == -1 && arg->prec == -1
14390 && !(arg->flags & (F_SIGN | F_BLANK)))
14391 {
14392 /* Fast path */
14393 if (formatfloat(v, arg, NULL, writer) == -1)
14394 return -1;
14395 return 1;
14396 }
14397
14398 arg->sign = 1;
14399 if (formatfloat(v, arg, p_str, NULL) == -1)
14400 return -1;
14401 break;
14402
14403 case 'c':
14404 {
14405 Py_UCS4 ch = formatchar(v);
14406 if (ch == (Py_UCS4) -1)
14407 return -1;
14408 if (arg->width == -1 && arg->prec == -1) {
14409 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014410 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014411 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014412 return 1;
14413 }
14414 *p_str = PyUnicode_FromOrdinal(ch);
14415 break;
14416 }
14417
14418 default:
14419 PyErr_Format(PyExc_ValueError,
14420 "unsupported format character '%c' (0x%x) "
14421 "at index %zd",
14422 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14423 (int)arg->ch,
14424 ctx->fmtpos - 1);
14425 return -1;
14426 }
14427 if (*p_str == NULL)
14428 return -1;
14429 assert (PyUnicode_Check(*p_str));
14430 return 0;
14431}
14432
14433static int
14434unicode_format_arg_output(struct unicode_formatter_t *ctx,
14435 struct unicode_format_arg_t *arg,
14436 PyObject *str)
14437{
14438 Py_ssize_t len;
14439 enum PyUnicode_Kind kind;
14440 void *pbuf;
14441 Py_ssize_t pindex;
14442 Py_UCS4 signchar;
14443 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014444 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014445 Py_ssize_t sublen;
14446 _PyUnicodeWriter *writer = &ctx->writer;
14447 Py_UCS4 fill;
14448
14449 fill = ' ';
14450 if (arg->sign && arg->flags & F_ZERO)
14451 fill = '0';
14452
14453 if (PyUnicode_READY(str) == -1)
14454 return -1;
14455
14456 len = PyUnicode_GET_LENGTH(str);
14457 if ((arg->width == -1 || arg->width <= len)
14458 && (arg->prec == -1 || arg->prec >= len)
14459 && !(arg->flags & (F_SIGN | F_BLANK)))
14460 {
14461 /* Fast path */
14462 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14463 return -1;
14464 return 0;
14465 }
14466
14467 /* Truncate the string for "s", "r" and "a" formats
14468 if the precision is set */
14469 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14470 if (arg->prec >= 0 && len > arg->prec)
14471 len = arg->prec;
14472 }
14473
14474 /* Adjust sign and width */
14475 kind = PyUnicode_KIND(str);
14476 pbuf = PyUnicode_DATA(str);
14477 pindex = 0;
14478 signchar = '\0';
14479 if (arg->sign) {
14480 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14481 if (ch == '-' || ch == '+') {
14482 signchar = ch;
14483 len--;
14484 pindex++;
14485 }
14486 else if (arg->flags & F_SIGN)
14487 signchar = '+';
14488 else if (arg->flags & F_BLANK)
14489 signchar = ' ';
14490 else
14491 arg->sign = 0;
14492 }
14493 if (arg->width < len)
14494 arg->width = len;
14495
14496 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014497 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014498 if (!(arg->flags & F_LJUST)) {
14499 if (arg->sign) {
14500 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014501 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014502 }
14503 else {
14504 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014505 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014506 }
14507 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014508 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14509 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014510 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014511 }
14512
Victor Stinnera47082312012-10-04 02:19:54 +020014513 buflen = arg->width;
14514 if (arg->sign && len == arg->width)
14515 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014516 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014517 return -1;
14518
14519 /* Write the sign if needed */
14520 if (arg->sign) {
14521 if (fill != ' ') {
14522 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14523 writer->pos += 1;
14524 }
14525 if (arg->width > len)
14526 arg->width--;
14527 }
14528
14529 /* Write the numeric prefix for "x", "X" and "o" formats
14530 if the alternate form is used.
14531 For example, write "0x" for the "%#x" format. */
14532 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14533 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14534 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14535 if (fill != ' ') {
14536 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14537 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14538 writer->pos += 2;
14539 pindex += 2;
14540 }
14541 arg->width -= 2;
14542 if (arg->width < 0)
14543 arg->width = 0;
14544 len -= 2;
14545 }
14546
14547 /* Pad left with the fill character if needed */
14548 if (arg->width > len && !(arg->flags & F_LJUST)) {
14549 sublen = arg->width - len;
14550 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14551 writer->pos += sublen;
14552 arg->width = len;
14553 }
14554
14555 /* If padding with spaces: write sign if needed and/or numeric prefix if
14556 the alternate form is used */
14557 if (fill == ' ') {
14558 if (arg->sign) {
14559 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14560 writer->pos += 1;
14561 }
14562 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14563 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14564 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14565 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14566 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14567 writer->pos += 2;
14568 pindex += 2;
14569 }
14570 }
14571
14572 /* Write characters */
14573 if (len) {
14574 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14575 str, pindex, len);
14576 writer->pos += len;
14577 }
14578
14579 /* Pad right with the fill character if needed */
14580 if (arg->width > len) {
14581 sublen = arg->width - len;
14582 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14583 writer->pos += sublen;
14584 }
14585 return 0;
14586}
14587
14588/* Helper of PyUnicode_Format(): format one arg.
14589 Return 0 on success, raise an exception and return -1 on error. */
14590static int
14591unicode_format_arg(struct unicode_formatter_t *ctx)
14592{
14593 struct unicode_format_arg_t arg;
14594 PyObject *str;
14595 int ret;
14596
Victor Stinner8dbd4212012-12-04 09:30:24 +010014597 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14598 arg.flags = 0;
14599 arg.width = -1;
14600 arg.prec = -1;
14601 arg.sign = 0;
14602 str = NULL;
14603
Victor Stinnera47082312012-10-04 02:19:54 +020014604 ret = unicode_format_arg_parse(ctx, &arg);
14605 if (ret == -1)
14606 return -1;
14607
14608 ret = unicode_format_arg_format(ctx, &arg, &str);
14609 if (ret == -1)
14610 return -1;
14611
14612 if (ret != 1) {
14613 ret = unicode_format_arg_output(ctx, &arg, str);
14614 Py_DECREF(str);
14615 if (ret == -1)
14616 return -1;
14617 }
14618
14619 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14620 PyErr_SetString(PyExc_TypeError,
14621 "not all arguments converted during string formatting");
14622 return -1;
14623 }
14624 return 0;
14625}
14626
Alexander Belopolsky40018472011-02-26 01:02:56 +000014627PyObject *
14628PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014629{
Victor Stinnera47082312012-10-04 02:19:54 +020014630 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014631
Guido van Rossumd57fd912000-03-10 22:53:23 +000014632 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014633 PyErr_BadInternalCall();
14634 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014635 }
Victor Stinnera47082312012-10-04 02:19:54 +020014636
14637 ctx.fmtstr = PyUnicode_FromObject(format);
14638 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014639 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014640 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14641 Py_DECREF(ctx.fmtstr);
14642 return NULL;
14643 }
14644 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14645 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14646 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14647 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014648
Victor Stinner8f674cc2013-04-17 23:02:17 +020014649 _PyUnicodeWriter_Init(&ctx.writer);
14650 ctx.writer.min_length = ctx.fmtcnt + 100;
14651 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014652
Guido van Rossumd57fd912000-03-10 22:53:23 +000014653 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014654 ctx.arglen = PyTuple_Size(args);
14655 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014656 }
14657 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014658 ctx.arglen = -1;
14659 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014660 }
Victor Stinnera47082312012-10-04 02:19:54 +020014661 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014662 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014663 ctx.dict = args;
14664 else
14665 ctx.dict = NULL;
14666 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014667
Victor Stinnera47082312012-10-04 02:19:54 +020014668 while (--ctx.fmtcnt >= 0) {
14669 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014670 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014671
14672 nonfmtpos = ctx.fmtpos++;
14673 while (ctx.fmtcnt >= 0 &&
14674 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14675 ctx.fmtpos++;
14676 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014677 }
Victor Stinnera47082312012-10-04 02:19:54 +020014678 if (ctx.fmtcnt < 0) {
14679 ctx.fmtpos--;
14680 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014681 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014682
Victor Stinnercfc4c132013-04-03 01:48:39 +020014683 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14684 nonfmtpos, ctx.fmtpos) < 0)
14685 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014686 }
14687 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014688 ctx.fmtpos++;
14689 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014690 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014691 }
14692 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014693
Victor Stinnera47082312012-10-04 02:19:54 +020014694 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014695 PyErr_SetString(PyExc_TypeError,
14696 "not all arguments converted during string formatting");
14697 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014698 }
14699
Victor Stinnera47082312012-10-04 02:19:54 +020014700 if (ctx.args_owned) {
14701 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014702 }
Victor Stinnera47082312012-10-04 02:19:54 +020014703 Py_DECREF(ctx.fmtstr);
14704 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014705
Benjamin Peterson29060642009-01-31 22:14:21 +000014706 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014707 Py_DECREF(ctx.fmtstr);
14708 _PyUnicodeWriter_Dealloc(&ctx.writer);
14709 if (ctx.args_owned) {
14710 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014711 }
14712 return NULL;
14713}
14714
Jeremy Hylton938ace62002-07-17 16:30:39 +000014715static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014716unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14717
Tim Peters6d6c1a32001-08-02 04:15:00 +000014718static PyObject *
14719unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14720{
Benjamin Peterson29060642009-01-31 22:14:21 +000014721 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014722 static char *kwlist[] = {"object", "encoding", "errors", 0};
14723 char *encoding = NULL;
14724 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014725
Benjamin Peterson14339b62009-01-31 16:36:08 +000014726 if (type != &PyUnicode_Type)
14727 return unicode_subtype_new(type, args, kwds);
14728 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014729 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014730 return NULL;
14731 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014732 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014733 if (encoding == NULL && errors == NULL)
14734 return PyObject_Str(x);
14735 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014736 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014737}
14738
Guido van Rossume023fe02001-08-30 03:12:59 +000014739static PyObject *
14740unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14741{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014742 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014743 Py_ssize_t length, char_size;
14744 int share_wstr, share_utf8;
14745 unsigned int kind;
14746 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014747
Benjamin Peterson14339b62009-01-31 16:36:08 +000014748 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014749
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014750 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014751 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014752 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014753 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014754 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014755 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014756 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014757 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014758
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014759 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014760 if (self == NULL) {
14761 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014762 return NULL;
14763 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014764 kind = PyUnicode_KIND(unicode);
14765 length = PyUnicode_GET_LENGTH(unicode);
14766
14767 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014768#ifdef Py_DEBUG
14769 _PyUnicode_HASH(self) = -1;
14770#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014771 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014772#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014773 _PyUnicode_STATE(self).interned = 0;
14774 _PyUnicode_STATE(self).kind = kind;
14775 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014776 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014777 _PyUnicode_STATE(self).ready = 1;
14778 _PyUnicode_WSTR(self) = NULL;
14779 _PyUnicode_UTF8_LENGTH(self) = 0;
14780 _PyUnicode_UTF8(self) = NULL;
14781 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014782 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014783
14784 share_utf8 = 0;
14785 share_wstr = 0;
14786 if (kind == PyUnicode_1BYTE_KIND) {
14787 char_size = 1;
14788 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14789 share_utf8 = 1;
14790 }
14791 else if (kind == PyUnicode_2BYTE_KIND) {
14792 char_size = 2;
14793 if (sizeof(wchar_t) == 2)
14794 share_wstr = 1;
14795 }
14796 else {
14797 assert(kind == PyUnicode_4BYTE_KIND);
14798 char_size = 4;
14799 if (sizeof(wchar_t) == 4)
14800 share_wstr = 1;
14801 }
14802
14803 /* Ensure we won't overflow the length. */
14804 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14805 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014806 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014807 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014808 data = PyObject_MALLOC((length + 1) * char_size);
14809 if (data == NULL) {
14810 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014811 goto onError;
14812 }
14813
Victor Stinnerc3c74152011-10-02 20:39:55 +020014814 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014815 if (share_utf8) {
14816 _PyUnicode_UTF8_LENGTH(self) = length;
14817 _PyUnicode_UTF8(self) = data;
14818 }
14819 if (share_wstr) {
14820 _PyUnicode_WSTR_LENGTH(self) = length;
14821 _PyUnicode_WSTR(self) = (wchar_t *)data;
14822 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014823
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014824 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014825 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014826 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014827#ifdef Py_DEBUG
14828 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14829#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014830 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014831 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014832
14833onError:
14834 Py_DECREF(unicode);
14835 Py_DECREF(self);
14836 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014837}
14838
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014839PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014840"str(object='') -> str\n\
14841str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014842\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014843Create a new string object from the given object. If encoding or\n\
14844errors is specified, then the object must expose a data buffer\n\
14845that will be decoded using the given encoding and error handler.\n\
14846Otherwise, returns the result of object.__str__() (if defined)\n\
14847or repr(object).\n\
14848encoding defaults to sys.getdefaultencoding().\n\
14849errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014850
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014851static PyObject *unicode_iter(PyObject *seq);
14852
Guido van Rossumd57fd912000-03-10 22:53:23 +000014853PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014854 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014855 "str", /* tp_name */
14856 sizeof(PyUnicodeObject), /* tp_size */
14857 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014858 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014859 (destructor)unicode_dealloc, /* tp_dealloc */
14860 0, /* tp_print */
14861 0, /* tp_getattr */
14862 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014863 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014864 unicode_repr, /* tp_repr */
14865 &unicode_as_number, /* tp_as_number */
14866 &unicode_as_sequence, /* tp_as_sequence */
14867 &unicode_as_mapping, /* tp_as_mapping */
14868 (hashfunc) unicode_hash, /* tp_hash*/
14869 0, /* tp_call*/
14870 (reprfunc) unicode_str, /* tp_str */
14871 PyObject_GenericGetAttr, /* tp_getattro */
14872 0, /* tp_setattro */
14873 0, /* tp_as_buffer */
14874 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014875 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014876 unicode_doc, /* tp_doc */
14877 0, /* tp_traverse */
14878 0, /* tp_clear */
14879 PyUnicode_RichCompare, /* tp_richcompare */
14880 0, /* tp_weaklistoffset */
14881 unicode_iter, /* tp_iter */
14882 0, /* tp_iternext */
14883 unicode_methods, /* tp_methods */
14884 0, /* tp_members */
14885 0, /* tp_getset */
14886 &PyBaseObject_Type, /* tp_base */
14887 0, /* tp_dict */
14888 0, /* tp_descr_get */
14889 0, /* tp_descr_set */
14890 0, /* tp_dictoffset */
14891 0, /* tp_init */
14892 0, /* tp_alloc */
14893 unicode_new, /* tp_new */
14894 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014895};
14896
14897/* Initialize the Unicode implementation */
14898
Victor Stinner3a50e702011-10-18 21:21:00 +020014899int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014900{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014901 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014902 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014903 0x000A, /* LINE FEED */
14904 0x000D, /* CARRIAGE RETURN */
14905 0x001C, /* FILE SEPARATOR */
14906 0x001D, /* GROUP SEPARATOR */
14907 0x001E, /* RECORD SEPARATOR */
14908 0x0085, /* NEXT LINE */
14909 0x2028, /* LINE SEPARATOR */
14910 0x2029, /* PARAGRAPH SEPARATOR */
14911 };
14912
Fred Drakee4315f52000-05-09 19:53:39 +000014913 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014914 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014915 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014916 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014917 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014918
Guido van Rossumcacfc072002-05-24 19:01:59 +000014919 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014920 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014921
14922 /* initialize the linebreak bloom filter */
14923 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014924 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014925 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014926
Christian Heimes26532f72013-07-20 14:57:16 +020014927 if (PyType_Ready(&EncodingMapType) < 0)
14928 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014929
Benjamin Petersonc4311282012-10-30 23:21:10 -040014930 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14931 Py_FatalError("Can't initialize field name iterator type");
14932
14933 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14934 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014935
Victor Stinner3a50e702011-10-18 21:21:00 +020014936#ifdef HAVE_MBCS
14937 winver.dwOSVersionInfoSize = sizeof(winver);
14938 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14939 PyErr_SetFromWindowsErr(0);
14940 return -1;
14941 }
14942#endif
14943 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014944}
14945
14946/* Finalize the Unicode implementation */
14947
/* Nothing is released by this function; it always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14953
Guido van Rossumd57fd912000-03-10 22:53:23 +000014954void
Thomas Wouters78890102000-07-22 19:25:51 +000014955_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014956{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014957 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014958
Serhiy Storchaka05997252013-01-26 12:14:02 +020014959 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014960
Serhiy Storchaka05997252013-01-26 12:14:02 +020014961 for (i = 0; i < 256; i++)
14962 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014963 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014964 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014965}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014966
Walter Dörwald16807132007-05-25 13:52:07 +000014967void
14968PyUnicode_InternInPlace(PyObject **p)
14969{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020014970 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014971 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014972#ifdef Py_DEBUG
14973 assert(s != NULL);
14974 assert(_PyUnicode_CHECK(s));
14975#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014976 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014977 return;
14978#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014979 /* If it's a subclass, we don't really know what putting
14980 it in the interned dict might do. */
14981 if (!PyUnicode_CheckExact(s))
14982 return;
14983 if (PyUnicode_CHECK_INTERNED(s))
14984 return;
14985 if (interned == NULL) {
14986 interned = PyDict_New();
14987 if (interned == NULL) {
14988 PyErr_Clear(); /* Don't leave an exception */
14989 return;
14990 }
14991 }
14992 /* It might be that the GetItem call fails even
14993 though the key is present in the dictionary,
14994 namely when this happens during a stack overflow. */
14995 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014996 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014997 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014998
Victor Stinnerf0335102013-04-14 19:13:03 +020014999 if (t) {
15000 Py_INCREF(t);
15001 Py_DECREF(*p);
15002 *p = t;
15003 return;
15004 }
Walter Dörwald16807132007-05-25 13:52:07 +000015005
Benjamin Peterson14339b62009-01-31 16:36:08 +000015006 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015007 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015008 PyErr_Clear();
15009 PyThreadState_GET()->recursion_critical = 0;
15010 return;
15011 }
15012 PyThreadState_GET()->recursion_critical = 0;
15013 /* The two references in interned are not counted by refcnt.
15014 The deallocator will take care of this */
15015 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015016 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015017}
15018
15019void
15020PyUnicode_InternImmortal(PyObject **p)
15021{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015022 PyUnicode_InternInPlace(p);
15023 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015024 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015025 Py_INCREF(*p);
15026 }
Walter Dörwald16807132007-05-25 13:52:07 +000015027}
15028
15029PyObject *
15030PyUnicode_InternFromString(const char *cp)
15031{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015032 PyObject *s = PyUnicode_FromString(cp);
15033 if (s == NULL)
15034 return NULL;
15035 PyUnicode_InternInPlace(&s);
15036 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015037}
15038
Alexander Belopolsky40018472011-02-26 01:02:56 +000015039void
15040_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015041{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015042 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015043 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015044 Py_ssize_t i, n;
15045 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015046
Benjamin Peterson14339b62009-01-31 16:36:08 +000015047 if (interned == NULL || !PyDict_Check(interned))
15048 return;
15049 keys = PyDict_Keys(interned);
15050 if (keys == NULL || !PyList_Check(keys)) {
15051 PyErr_Clear();
15052 return;
15053 }
Walter Dörwald16807132007-05-25 13:52:07 +000015054
Benjamin Peterson14339b62009-01-31 16:36:08 +000015055 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15056 detector, interned unicode strings are not forcibly deallocated;
15057 rather, we give them their stolen references back, and then clear
15058 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015059
Benjamin Peterson14339b62009-01-31 16:36:08 +000015060 n = PyList_GET_SIZE(keys);
15061 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015062 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015063 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015064 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015065 if (PyUnicode_READY(s) == -1) {
15066 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015067 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015068 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015069 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015070 case SSTATE_NOT_INTERNED:
15071 /* XXX Shouldn't happen */
15072 break;
15073 case SSTATE_INTERNED_IMMORTAL:
15074 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015075 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015076 break;
15077 case SSTATE_INTERNED_MORTAL:
15078 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015079 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015080 break;
15081 default:
15082 Py_FatalError("Inconsistent interned string state.");
15083 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015084 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015085 }
15086 fprintf(stderr, "total size of all interned strings: "
15087 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15088 "mortal/immortal\n", mortal_size, immortal_size);
15089 Py_DECREF(keys);
15090 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015091 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015092}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015093
15094
15095/********************* Unicode Iterator **************************/
15096
15097typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015098 PyObject_HEAD
15099 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015100 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015101} unicodeiterobject;
15102
15103static void
15104unicodeiter_dealloc(unicodeiterobject *it)
15105{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015106 _PyObject_GC_UNTRACK(it);
15107 Py_XDECREF(it->it_seq);
15108 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015109}
15110
15111static int
15112unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15113{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015114 Py_VISIT(it->it_seq);
15115 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015116}
15117
15118static PyObject *
15119unicodeiter_next(unicodeiterobject *it)
15120{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015121 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015122
Benjamin Peterson14339b62009-01-31 16:36:08 +000015123 assert(it != NULL);
15124 seq = it->it_seq;
15125 if (seq == NULL)
15126 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015127 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015129 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15130 int kind = PyUnicode_KIND(seq);
15131 void *data = PyUnicode_DATA(seq);
15132 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15133 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015134 if (item != NULL)
15135 ++it->it_index;
15136 return item;
15137 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015138
Benjamin Peterson14339b62009-01-31 16:36:08 +000015139 Py_DECREF(seq);
15140 it->it_seq = NULL;
15141 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015142}
15143
15144static PyObject *
15145unicodeiter_len(unicodeiterobject *it)
15146{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015147 Py_ssize_t len = 0;
15148 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015149 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015150 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015151}
15152
15153PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15154
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015155static PyObject *
15156unicodeiter_reduce(unicodeiterobject *it)
15157{
15158 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015159 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015160 it->it_seq, it->it_index);
15161 } else {
15162 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15163 if (u == NULL)
15164 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015165 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015166 }
15167}
15168
15169PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15170
15171static PyObject *
15172unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15173{
15174 Py_ssize_t index = PyLong_AsSsize_t(state);
15175 if (index == -1 && PyErr_Occurred())
15176 return NULL;
15177 if (index < 0)
15178 index = 0;
15179 it->it_index = index;
15180 Py_RETURN_NONE;
15181}
15182
15183PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15184
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015185static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015186 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015187 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015188 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15189 reduce_doc},
15190 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15191 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015192 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015193};
15194
15195PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015196 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15197 "str_iterator", /* tp_name */
15198 sizeof(unicodeiterobject), /* tp_basicsize */
15199 0, /* tp_itemsize */
15200 /* methods */
15201 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15202 0, /* tp_print */
15203 0, /* tp_getattr */
15204 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015205 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015206 0, /* tp_repr */
15207 0, /* tp_as_number */
15208 0, /* tp_as_sequence */
15209 0, /* tp_as_mapping */
15210 0, /* tp_hash */
15211 0, /* tp_call */
15212 0, /* tp_str */
15213 PyObject_GenericGetAttr, /* tp_getattro */
15214 0, /* tp_setattro */
15215 0, /* tp_as_buffer */
15216 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15217 0, /* tp_doc */
15218 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15219 0, /* tp_clear */
15220 0, /* tp_richcompare */
15221 0, /* tp_weaklistoffset */
15222 PyObject_SelfIter, /* tp_iter */
15223 (iternextfunc)unicodeiter_next, /* tp_iternext */
15224 unicodeiter_methods, /* tp_methods */
15225 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015226};
15227
15228static PyObject *
15229unicode_iter(PyObject *seq)
15230{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015231 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015232
Benjamin Peterson14339b62009-01-31 16:36:08 +000015233 if (!PyUnicode_Check(seq)) {
15234 PyErr_BadInternalCall();
15235 return NULL;
15236 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015237 if (PyUnicode_READY(seq) == -1)
15238 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015239 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15240 if (it == NULL)
15241 return NULL;
15242 it->it_index = 0;
15243 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015244 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015245 _PyObject_GC_TRACK(it);
15246 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015247}
15248
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015249
15250size_t
15251Py_UNICODE_strlen(const Py_UNICODE *u)
15252{
15253 int res = 0;
15254 while(*u++)
15255 res++;
15256 return res;
15257}
15258
15259Py_UNICODE*
15260Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15261{
15262 Py_UNICODE *u = s1;
15263 while ((*u++ = *s2++));
15264 return s1;
15265}
15266
15267Py_UNICODE*
15268Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15269{
15270 Py_UNICODE *u = s1;
15271 while ((*u++ = *s2++))
15272 if (n-- == 0)
15273 break;
15274 return s1;
15275}
15276
15277Py_UNICODE*
15278Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15279{
15280 Py_UNICODE *u1 = s1;
15281 u1 += Py_UNICODE_strlen(u1);
15282 Py_UNICODE_strcpy(u1, s2);
15283 return s1;
15284}
15285
15286int
15287Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15288{
15289 while (*s1 && *s2 && *s1 == *s2)
15290 s1++, s2++;
15291 if (*s1 && *s2)
15292 return (*s1 < *s2) ? -1 : +1;
15293 if (*s1)
15294 return 1;
15295 if (*s2)
15296 return -1;
15297 return 0;
15298}
15299
15300int
15301Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15302{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015303 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015304 for (; n != 0; n--) {
15305 u1 = *s1;
15306 u2 = *s2;
15307 if (u1 != u2)
15308 return (u1 < u2) ? -1 : +1;
15309 if (u1 == '\0')
15310 return 0;
15311 s1++;
15312 s2++;
15313 }
15314 return 0;
15315}
15316
15317Py_UNICODE*
15318Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15319{
15320 const Py_UNICODE *p;
15321 for (p = s; *p; p++)
15322 if (*p == c)
15323 return (Py_UNICODE*)p;
15324 return NULL;
15325}
15326
15327Py_UNICODE*
15328Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15329{
15330 const Py_UNICODE *p;
15331 p = s + Py_UNICODE_strlen(s);
15332 while (p != s) {
15333 p--;
15334 if (*p == c)
15335 return (Py_UNICODE*)p;
15336 }
15337 return NULL;
15338}
Victor Stinner331ea922010-08-10 16:37:20 +000015339
Victor Stinner71133ff2010-09-01 23:43:53 +000015340Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015341PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015342{
Victor Stinner577db2c2011-10-11 22:12:48 +020015343 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015344 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015346 if (!PyUnicode_Check(unicode)) {
15347 PyErr_BadArgument();
15348 return NULL;
15349 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015350 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015351 if (u == NULL)
15352 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015353 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015354 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015355 PyErr_NoMemory();
15356 return NULL;
15357 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015358 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015359 size *= sizeof(Py_UNICODE);
15360 copy = PyMem_Malloc(size);
15361 if (copy == NULL) {
15362 PyErr_NoMemory();
15363 return NULL;
15364 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015365 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015366 return copy;
15367}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015368
Georg Brandl66c221e2010-10-14 07:04:07 +000015369/* A _string module, to export formatter_parser and formatter_field_name_split
15370 to the string.Formatter class implemented in Python. */
15371
15372static PyMethodDef _string_methods[] = {
15373 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15374 METH_O, PyDoc_STR("split the argument as a field name")},
15375 {"formatter_parser", (PyCFunction) formatter_parser,
15376 METH_O, PyDoc_STR("parse the argument as a format string")},
15377 {NULL, NULL}
15378};
15379
15380static struct PyModuleDef _string_module = {
15381 PyModuleDef_HEAD_INIT,
15382 "_string",
15383 PyDoc_STR("string helper module"),
15384 0,
15385 _string_methods,
15386 NULL,
15387 NULL,
15388 NULL,
15389 NULL
15390};
15391
15392PyMODINIT_FUNC
15393PyInit__string(void)
15394{
15395 return PyModule_Create(&_string_module);
15396}
15397
15398
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015399#ifdef __cplusplus
15400}
15401#endif