blob: ab261cc953f30bcc4232c23df1bc84ead300e168 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Larry Hastings61272b72014-01-07 12:41:53 -080050/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080051class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080052[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080053/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080054
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055/* --- Globals ------------------------------------------------------------
56
Serhiy Storchaka05997252013-01-26 12:14:02 +020057NOTE: In the interpreter's initialization phase, some globals are currently
58 initialized dynamically as needed. In the process Unicode objects may
59 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000060
61*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063
64#ifdef __cplusplus
65extern "C" {
66#endif
67
/* Largest valid Unicode code point (Unicode 6.0): U+10FFFF, i.e. 1,114,111. */
#define MAX_UNICODE 0x10FFFF
70
/* Check applied to an object by the assert()s in this file: a structural
   consistency check (without content scan) in debug builds, a plain type
   check otherwise. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020076
/* Unchecked access to the utf8 field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* UTF-8 buffer of a ready string.  For a compact ASCII string this is the
   memory located directly after the PyASCIIObject header; otherwise it is
   the utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((char *)((PyASCIIObject *)(op) + 1)) \
         : _PyUnicode_UTF8(op))

/* Unchecked access to the utf8_length field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* Length of the UTF-8 buffer of a ready string.  For a compact ASCII string
   this is the string length itself; otherwise it is the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((PyASCIIObject *)(op))->length \
         : _PyUnicode_UTF8_LENGTH(op))
/* Direct field accessors.  Unless noted otherwise they perform no checks
   and expand to lvalues. */

/* wstr pointer (PyASCIIObject). */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject *)(op))->wstr)

/* wstr_length field (PyCompactUnicodeObject). */
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)

/* length field (PyASCIIObject). */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)

/* state bit-field structure (PyASCIIObject). */
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)

/* hash field (PyASCIIObject). */
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* state.kind, after asserting that op passes _PyUnicode_CHECK. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)

/* length, after asserting that op passes _PyUnicode_CHECK. */
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* data.any pointer (PyUnicodeObject). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200111
/* File-local replacement for PyUnicode_READY: asserts that op passes
   _PyUnicode_CHECK, then evaluates to 0 when the string is already ready
   and to the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True if the utf8 pointer of op is the very same buffer as its character
   data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the very same buffer as its character
   data.  The macro uses its own argument throughout, so it no longer
   depends on the caller having a variable that happens to be named
   `unicode`. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* True if the Unicode object owns a separately allocated UTF-8 buffer:
   it is not a compact ASCII string, its utf8 pointer is set, and that
   pointer is not shared with the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (!PyUnicode_IS_COMPACT_ASCII(op) \
     && _PyUnicode_UTF8(op) != NULL \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))
133
/* True if the Unicode object owns a separately allocated wstr buffer:
   its wstr pointer is set and either the string is not ready yet or the
   pointer is not shared with the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (_PyUnicode_WSTR(op) != NULL \
     && (!PyUnicode_IS_READY(op) \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
140
/* Generic helper macro that copies characters while converting their type.

   from_type, to_type: valid type names of the source and destination
                       characters.
   begin, end:         pointers delimiting the source characters; they are
                       read as "const from_type *".
   to:                 pointer of type "to_type *" to the buffer receiving
                       the converted characters; it must have room for
                       (end - begin) items.

   The bulk of the input is copied four characters per iteration and the
   remaining (at most three) characters one by one.  The source pointers
   keep their const qualification, and every local of the macro carries a
   leading underscore so that it cannot shadow a variable of the caller. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        to_type *_to = (to_type *)(to);                              \
        const from_type *_iter = (const from_type *)(begin);         \
        const from_type *_end = (const from_type *)(end);            \
        Py_ssize_t _len = (_end) - (_iter);                          \
        const from_type *_unrolled_end =                             \
            _iter + _Py_SIZE_ROUND_DOWN(_len, 4);                    \
        while (_iter < (_unrolled_end)) {                            \
            _to[0] = (to_type) _iter[0];                             \
            _to[1] = (to_type) _iter[1];                             \
            _to[2] = (to_type) _iter[2];                             \
            _to[3] = (to_type) _iter[3];                             \
            _iter += 4; _to += 4;                                    \
        }                                                            \
        while (_iter < (_end))                                       \
            *_to++ = (to_type) *_iter++;                             \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
Walter Dörwald16807132007-05-25 13:52:07 +0000165/* This dictionary holds all interned unicode strings. Note that references
166 to strings in this dictionary are *not* counted in the string's ob_refcnt.
167 When the interned string reaches a refcnt of 0 the string deallocation
168 function will delete the reference from this dictionary.
169
170 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000171 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000172*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200173static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000174
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200176static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200177
Serhiy Storchaka678db842013-01-26 12:16:36 +0200178#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200179 do { \
180 if (unicode_empty != NULL) \
181 Py_INCREF(unicode_empty); \
182 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200183 unicode_empty = PyUnicode_New(0, 0); \
184 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200185 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200186 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
187 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200189 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190
Serhiy Storchaka678db842013-01-26 12:16:36 +0200191#define _Py_RETURN_UNICODE_EMPTY() \
192 do { \
193 _Py_INCREF_UNICODE_EMPTY(); \
194 return unicode_empty; \
195 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200197/* Forward declaration */
198Py_LOCAL_INLINE(int)
199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
200
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by character value (0..127); an entry is 1 for:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN        0x1C FILE SEPARATOR
     0x1D GROUP SEPARATOR        0x1E RECORD SEPARATOR
     0x1F UNIT SEPARATOR         0x20 SPACE
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100242static int unicode_modifiable(PyObject *unicode);
243
Victor Stinnerfe226c02011-10-03 03:52:20 +0200244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200247static PyObject *
248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
249static PyObject *
250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
251
252static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100255 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
257
Alexander Belopolsky40018472011-02-26 01:02:56 +0000258static void
259raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300260 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100261 PyObject *unicode,
262 Py_ssize_t startpos, Py_ssize_t endpos,
263 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000264
/* Fast detection of line break characters in the ASCII range.

   Lookup table indexed by character value (0..127); an entry is 1 for:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
   and 0 for everything else.  The table is only ever read, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
292
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300293#include "clinic/unicodeobject.c.h"
294
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300295/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
296 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000298PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000299{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000300#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000302#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000303 /* This is actually an illegal character, so it should
304 not be passed to unichr. */
305 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000306#endif
307}
308
#ifdef Py_DEBUG
/* Debug-only sanity check of the internal layout of a unicode object.

   Asserts that the state flags of `op` (kind, compact, ascii, ready,
   interned) agree with its data, utf8 and wstr pointers and their lengths.
   If `check_content` is non-zero and the string is not of the wchar_t kind,
   the characters are scanned as well to verify that the narrowest possible
   kind is used and that the buffer ends with a null character.

   Any violation trips an assert(); otherwise the function returns 1, which
   allows calls to be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact != 1) {
            /* non-compact object: the characters live in data.any */
            data = ((PyUnicodeObject *)op)->data.any;

            if (kind != PyUnicode_WCHAR_KIND) {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
            else {
                /* not ready yet: only the wstr representation exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }
        else {
            /* compact non-ASCII string: the characters directly follow the
               PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr is expected to alias the data exactly when the kind has
               the same width as wchar_t */
#if SIZEOF_WCHAR_T == 2
            const int same_width_as_wchar = (kind == PyUnicode_2BYTE_KIND);
#else
            const int same_width_as_wchar = (kind == PyUnicode_4BYTE_KIND);
#endif
            if (same_width_as_wchar) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* a missing buffer must have a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *chars = PyUnicode_DATA(ascii);
        Py_UCS4 highest = 0;
        Py_ssize_t pos = 0;

        while (pos < ascii->length) {
            Py_UCS4 current = PyUnicode_READ(kind, chars, pos);
            if (current > highest)
                highest = current;
            pos++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii != 0) {
                assert(highest < 128);
            }
            else {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
            break;
        default:
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
            break;
        }

        /* the buffer must be null-terminated */
        assert(PyUnicode_READ(kind, chars, ascii->length) == 0);
    }
    return 1;
}
#endif
424
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100425static PyObject*
426unicode_result_wchar(PyObject *unicode)
427{
428#ifndef Py_DEBUG
429 Py_ssize_t len;
430
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 len = _PyUnicode_WSTR_LENGTH(unicode);
432 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200434 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100435 }
436
437 if (len == 1) {
438 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100439 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100440 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
441 Py_DECREF(unicode);
442 return latin1_char;
443 }
444 }
445
446 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200447 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100448 return NULL;
449 }
450#else
Victor Stinneraa771272012-10-04 02:32:58 +0200451 assert(Py_REFCNT(unicode) == 1);
452
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100453 /* don't make the result ready in debug mode to ensure that the caller
454 makes the string ready before using it */
455 assert(_PyUnicode_CheckConsistency(unicode, 1));
456#endif
457 return unicode;
458}
459
460static PyObject*
461unicode_result_ready(PyObject *unicode)
462{
463 Py_ssize_t length;
464
465 length = PyUnicode_GET_LENGTH(unicode);
466 if (length == 0) {
467 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200469 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100470 }
471 return unicode_empty;
472 }
473
474 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200475 void *data = PyUnicode_DATA(unicode);
476 int kind = PyUnicode_KIND(unicode);
477 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100478 if (ch < 256) {
479 PyObject *latin1_char = unicode_latin1[ch];
480 if (latin1_char != NULL) {
481 if (unicode != latin1_char) {
482 Py_INCREF(latin1_char);
483 Py_DECREF(unicode);
484 }
485 return latin1_char;
486 }
487 else {
488 assert(_PyUnicode_CheckConsistency(unicode, 1));
489 Py_INCREF(unicode);
490 unicode_latin1[ch] = unicode;
491 return unicode;
492 }
493 }
494 }
495
496 assert(_PyUnicode_CheckConsistency(unicode, 1));
497 return unicode;
498}
499
500static PyObject*
501unicode_result(PyObject *unicode)
502{
503 assert(_PyUnicode_CHECK(unicode));
504 if (PyUnicode_IS_READY(unicode))
505 return unicode_result_ready(unicode);
506 else
507 return unicode_result_wchar(unicode);
508}
509
Victor Stinnerc4b49542011-12-11 22:44:26 +0100510static PyObject*
511unicode_result_unchanged(PyObject *unicode)
512{
513 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500514 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100515 return NULL;
516 Py_INCREF(unicode);
517 return unicode;
518 }
519 else
520 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100521 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100522}
523
Thomas Wouters477c8d52006-05-27 19:21:47 +0000524/* --- Bloom Filters ----------------------------------------------------- */
525
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used; the low-order bits of
   each Unicode character (as many as BLOOM_WIDTH allows) form the bit
   index. */
529
530/* the linebreak mask is set up by Unicode_Init below */
531
Antoine Pitrouf068f942010-01-13 14:19:12 +0000532#if LONG_BIT >= 128
533#define BLOOM_WIDTH 128
534#elif LONG_BIT >= 64
535#define BLOOM_WIDTH 64
536#elif LONG_BIT >= 32
537#define BLOOM_WIDTH 32
538#else
539#error "LONG_BIT is smaller than 32"
540#endif
541
/* Integer type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask of the line break characters.  It starts with all bits set,
   so no character is rejected by the filter until a real mask is stored. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by character `ch` is set in `mask`, i.e. if
   `ch` may belong to the set described by the mask (false positives are
   possible, false negatives are not).  Both arguments are parenthesised so
   that arbitrary expressions can be passed. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* True if `ch` is a line break character: characters below 128 are looked
   up in ascii_linebreak[]; all others are first screened with the bloom
   mask and then confirmed with Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
Alexander Belopolsky40018472011-02-26 01:02:56 +0000552Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000554{
Victor Stinnera85af502013-04-09 21:53:54 +0200555#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
556 do { \
557 TYPE *data = (TYPE *)PTR; \
558 TYPE *end = data + LEN; \
559 Py_UCS4 ch; \
560 for (; data != end; data++) { \
561 ch = *data; \
562 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
563 } \
564 break; \
565 } while (0)
566
Thomas Wouters477c8d52006-05-27 19:21:47 +0000567 /* calculate simple bloom-style bitmask for a given unicode string */
568
Antoine Pitrouf068f942010-01-13 14:19:12 +0000569 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000570
571 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200572 switch (kind) {
573 case PyUnicode_1BYTE_KIND:
574 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
575 break;
576 case PyUnicode_2BYTE_KIND:
577 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
578 break;
579 case PyUnicode_4BYTE_KIND:
580 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
581 break;
582 default:
583 assert(0);
584 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000585 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200586
587#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000588}
589
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200590/* Compilation of templated routines */
591
592#include "stringlib/asciilib.h"
593#include "stringlib/fastsearch.h"
594#include "stringlib/partition.h"
595#include "stringlib/split.h"
596#include "stringlib/count.h"
597#include "stringlib/find.h"
598#include "stringlib/find_max_char.h"
599#include "stringlib/localeutil.h"
600#include "stringlib/undef.h"
601
602#include "stringlib/ucs1lib.h"
603#include "stringlib/fastsearch.h"
604#include "stringlib/partition.h"
605#include "stringlib/split.h"
606#include "stringlib/count.h"
607#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300608#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200609#include "stringlib/find_max_char.h"
610#include "stringlib/localeutil.h"
611#include "stringlib/undef.h"
612
613#include "stringlib/ucs2lib.h"
614#include "stringlib/fastsearch.h"
615#include "stringlib/partition.h"
616#include "stringlib/split.h"
617#include "stringlib/count.h"
618#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300619#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200620#include "stringlib/find_max_char.h"
621#include "stringlib/localeutil.h"
622#include "stringlib/undef.h"
623
624#include "stringlib/ucs4lib.h"
625#include "stringlib/fastsearch.h"
626#include "stringlib/partition.h"
627#include "stringlib/split.h"
628#include "stringlib/count.h"
629#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300630#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200631#include "stringlib/find_max_char.h"
632#include "stringlib/localeutil.h"
633#include "stringlib/undef.h"
634
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200635#include "stringlib/unicodedefs.h"
636#include "stringlib/fastsearch.h"
637#include "stringlib/count.h"
638#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100639#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200640
Guido van Rossumd57fd912000-03-10 22:53:23 +0000641/* --- Unicode Object ----------------------------------------------------- */
642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200643static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200644fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645
Serhiy Storchakad9d769f2015-03-24 21:55:47 +0200646Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200647 Py_ssize_t size, Py_UCS4 ch,
648 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200649{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200650 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
651
652 switch (kind) {
653 case PyUnicode_1BYTE_KIND:
654 {
655 Py_UCS1 ch1 = (Py_UCS1) ch;
656 if (ch1 == ch)
657 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
658 else
659 return -1;
660 }
661 case PyUnicode_2BYTE_KIND:
662 {
663 Py_UCS2 ch2 = (Py_UCS2) ch;
664 if (ch2 == ch)
665 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
666 else
667 return -1;
668 }
669 case PyUnicode_4BYTE_KIND:
670 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
671 default:
672 assert(0);
673 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200675}
676
Victor Stinnerafffce42012-10-03 23:03:17 +0200677#ifdef Py_DEBUG
Martin Panter6245cb32016-04-15 02:14:19 +0000678/* Fill the data of a Unicode string with invalid characters to detect bugs
Victor Stinnerafffce42012-10-03 23:03:17 +0200679 earlier.
680
681 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
682 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
683 invalid character in Unicode 6.0. */
684static void
685unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
686{
687 int kind = PyUnicode_KIND(unicode);
688 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
689 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
690 if (length <= old_length)
691 return;
692 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
693}
694#endif
695
Victor Stinnerfe226c02011-10-03 03:52:20 +0200696static PyObject*
697resize_compact(PyObject *unicode, Py_ssize_t length)
698{
699 Py_ssize_t char_size;
700 Py_ssize_t struct_size;
701 Py_ssize_t new_size;
702 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100703 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200704#ifdef Py_DEBUG
705 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
706#endif
707
Victor Stinner79891572012-05-03 13:43:07 +0200708 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100710 assert(PyUnicode_IS_COMPACT(unicode));
711
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200712 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100713 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200714 struct_size = sizeof(PyASCIIObject);
715 else
716 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200717 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718
Victor Stinnerfe226c02011-10-03 03:52:20 +0200719 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
720 PyErr_NoMemory();
721 return NULL;
722 }
723 new_size = (struct_size + (length + 1) * char_size);
724
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200725 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
726 PyObject_DEL(_PyUnicode_UTF8(unicode));
727 _PyUnicode_UTF8(unicode) = NULL;
728 _PyUnicode_UTF8_LENGTH(unicode) = 0;
729 }
Victor Stinner84def372011-12-11 20:04:56 +0100730 _Py_DEC_REFTOTAL;
731 _Py_ForgetReference(unicode);
732
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300733 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100734 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100735 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 PyErr_NoMemory();
737 return NULL;
738 }
Victor Stinner84def372011-12-11 20:04:56 +0100739 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200740 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100741
Victor Stinnerfe226c02011-10-03 03:52:20 +0200742 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200743 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200744 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100745 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200746 _PyUnicode_WSTR_LENGTH(unicode) = length;
747 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100748 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
749 PyObject_DEL(_PyUnicode_WSTR(unicode));
750 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100751 if (!PyUnicode_IS_ASCII(unicode))
752 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100753 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200754#ifdef Py_DEBUG
755 unicode_fill_invalid(unicode, old_length);
756#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200757 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
758 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200759 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760 return unicode;
761}
762
Alexander Belopolsky40018472011-02-26 01:02:56 +0000763static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200764resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000765{
Victor Stinner95663112011-10-04 01:03:50 +0200766 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100767 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200768 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000770
Victor Stinnerfe226c02011-10-03 03:52:20 +0200771 if (PyUnicode_IS_READY(unicode)) {
772 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200773 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200774 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200775#ifdef Py_DEBUG
776 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
777#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200778
779 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200780 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200781 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
782 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200783
784 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
785 PyErr_NoMemory();
786 return -1;
787 }
788 new_size = (length + 1) * char_size;
789
Victor Stinner7a9105a2011-12-12 00:13:42 +0100790 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
791 {
792 PyObject_DEL(_PyUnicode_UTF8(unicode));
793 _PyUnicode_UTF8(unicode) = NULL;
794 _PyUnicode_UTF8_LENGTH(unicode) = 0;
795 }
796
Victor Stinnerfe226c02011-10-03 03:52:20 +0200797 data = (PyObject *)PyObject_REALLOC(data, new_size);
798 if (data == NULL) {
799 PyErr_NoMemory();
800 return -1;
801 }
802 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200803 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200804 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200805 _PyUnicode_WSTR_LENGTH(unicode) = length;
806 }
807 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200808 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200809 _PyUnicode_UTF8_LENGTH(unicode) = length;
810 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200811 _PyUnicode_LENGTH(unicode) = length;
812 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200813#ifdef Py_DEBUG
814 unicode_fill_invalid(unicode, old_length);
815#endif
Victor Stinner95663112011-10-04 01:03:50 +0200816 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200817 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200818 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200819 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200820 }
Victor Stinner95663112011-10-04 01:03:50 +0200821 assert(_PyUnicode_WSTR(unicode) != NULL);
822
823 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700824 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +0200825 PyErr_NoMemory();
826 return -1;
827 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100828 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200829 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100830 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200831 if (!wstr) {
832 PyErr_NoMemory();
833 return -1;
834 }
835 _PyUnicode_WSTR(unicode) = wstr;
836 _PyUnicode_WSTR(unicode)[length] = 0;
837 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200838 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000839 return 0;
840}
841
Victor Stinnerfe226c02011-10-03 03:52:20 +0200842static PyObject*
843resize_copy(PyObject *unicode, Py_ssize_t length)
844{
845 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100846 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200847 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100848
Benjamin Petersonbac79492012-01-14 13:34:47 -0500849 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100850 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200851
852 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
853 if (copy == NULL)
854 return NULL;
855
856 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200857 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200858 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200859 }
860 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200861 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100862
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200863 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200864 if (w == NULL)
865 return NULL;
866 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
867 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200868 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
869 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200870 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200871 }
872}
873
Guido van Rossumd57fd912000-03-10 22:53:23 +0000874/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000875 Ux0000 terminated; some code (e.g. new_identifier)
876 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000877
878 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000879 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880
881*/
882
Alexander Belopolsky40018472011-02-26 01:02:56 +0000883static PyUnicodeObject *
884_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200886 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200887 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000888
Thomas Wouters477c8d52006-05-27 19:21:47 +0000889 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890 if (length == 0 && unicode_empty != NULL) {
891 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200892 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893 }
894
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000895 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700896 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000897 return (PyUnicodeObject *)PyErr_NoMemory();
898 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200899 if (length < 0) {
900 PyErr_SetString(PyExc_SystemError,
901 "Negative size passed to _PyUnicode_New");
902 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000903 }
904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200905 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
906 if (unicode == NULL)
907 return NULL;
908 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100909
910 _PyUnicode_WSTR_LENGTH(unicode) = length;
911 _PyUnicode_HASH(unicode) = -1;
912 _PyUnicode_STATE(unicode).interned = 0;
913 _PyUnicode_STATE(unicode).kind = 0;
914 _PyUnicode_STATE(unicode).compact = 0;
915 _PyUnicode_STATE(unicode).ready = 0;
916 _PyUnicode_STATE(unicode).ascii = 0;
917 _PyUnicode_DATA_ANY(unicode) = NULL;
918 _PyUnicode_LENGTH(unicode) = 0;
919 _PyUnicode_UTF8(unicode) = NULL;
920 _PyUnicode_UTF8_LENGTH(unicode) = 0;
921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200922 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
923 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100924 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000925 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100926 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000927 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200928
Jeremy Hyltond8082792003-09-16 19:41:39 +0000929 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000930 * the caller fails before initializing str -- unicode_resize()
931 * reads str[0], and the Keep-Alive optimization can keep memory
932 * allocated for str alive across a call to unicode_dealloc(unicode).
933 * We don't want unicode_resize to read uninitialized memory in
934 * that case.
935 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200936 _PyUnicode_WSTR(unicode)[0] = 0;
937 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100938
Victor Stinner7931d9a2011-11-04 00:22:48 +0100939 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000940 return unicode;
941}
942
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943static const char*
944unicode_kind_name(PyObject *unicode)
945{
Victor Stinner42dfd712011-10-03 14:41:45 +0200946 /* don't check consistency: unicode_kind_name() is called from
947 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200948 if (!PyUnicode_IS_COMPACT(unicode))
949 {
950 if (!PyUnicode_IS_READY(unicode))
951 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600952 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200953 {
954 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200955 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200956 return "legacy ascii";
957 else
958 return "legacy latin1";
959 case PyUnicode_2BYTE_KIND:
960 return "legacy UCS2";
961 case PyUnicode_4BYTE_KIND:
962 return "legacy UCS4";
963 default:
964 return "<legacy invalid kind>";
965 }
966 }
967 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600968 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200970 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200971 return "ascii";
972 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200973 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200974 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200975 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200976 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200977 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200978 default:
979 return "<invalid compact kind>";
980 }
981}
982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
988
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
992void *_PyUnicode_data(void *unicode){
993 printf("obj %p\n", unicode);
994 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
995 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
996 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
997 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
998 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
999 return PyUnicode_DATA(unicode);
1000}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001001
1002void
1003_PyUnicode_Dump(PyObject *op)
1004{
1005 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001006 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1007 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1008 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001009
Victor Stinnera849a4b2011-10-03 12:12:11 +02001010 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001011 {
1012 if (ascii->state.ascii)
1013 data = (ascii + 1);
1014 else
1015 data = (compact + 1);
1016 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001017 else
1018 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001019 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1020 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001021
Victor Stinnera849a4b2011-10-03 12:12:11 +02001022 if (ascii->wstr == data)
1023 printf("shared ");
1024 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001025
Victor Stinnera3b334d2011-10-03 13:53:37 +02001026 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001027 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001028 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1029 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001030 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1031 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001032 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001033 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001034}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001035#endif
1036
1037PyObject *
1038PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1039{
1040 PyObject *obj;
1041 PyCompactUnicodeObject *unicode;
1042 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001043 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001044 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 Py_ssize_t char_size;
1046 Py_ssize_t struct_size;
1047
1048 /* Optimization for empty strings */
1049 if (size == 0 && unicode_empty != NULL) {
1050 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001051 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 }
1053
Victor Stinner9e9d6892011-10-04 01:02:02 +02001054 is_ascii = 0;
1055 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 struct_size = sizeof(PyCompactUnicodeObject);
1057 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001058 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001059 char_size = 1;
1060 is_ascii = 1;
1061 struct_size = sizeof(PyASCIIObject);
1062 }
1063 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001064 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 char_size = 1;
1066 }
1067 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001068 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 char_size = 2;
1070 if (sizeof(wchar_t) == 2)
1071 is_sharing = 1;
1072 }
1073 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001074 if (maxchar > MAX_UNICODE) {
1075 PyErr_SetString(PyExc_SystemError,
1076 "invalid maximum character passed to PyUnicode_New");
1077 return NULL;
1078 }
Victor Stinner8f825062012-04-27 13:55:39 +02001079 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001080 char_size = 4;
1081 if (sizeof(wchar_t) == 4)
1082 is_sharing = 1;
1083 }
1084
1085 /* Ensure we won't overflow the size. */
1086 if (size < 0) {
1087 PyErr_SetString(PyExc_SystemError,
1088 "Negative size passed to PyUnicode_New");
1089 return NULL;
1090 }
1091 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1092 return PyErr_NoMemory();
1093
1094 /* Duplicated allocation code from _PyObject_New() instead of a call to
1095 * PyObject_New() so we are able to allocate space for the object and
1096 * it's data buffer.
1097 */
1098 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1099 if (obj == NULL)
1100 return PyErr_NoMemory();
1101 obj = PyObject_INIT(obj, &PyUnicode_Type);
1102 if (obj == NULL)
1103 return NULL;
1104
1105 unicode = (PyCompactUnicodeObject *)obj;
1106 if (is_ascii)
1107 data = ((PyASCIIObject*)obj) + 1;
1108 else
1109 data = unicode + 1;
1110 _PyUnicode_LENGTH(unicode) = size;
1111 _PyUnicode_HASH(unicode) = -1;
1112 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001113 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001114 _PyUnicode_STATE(unicode).compact = 1;
1115 _PyUnicode_STATE(unicode).ready = 1;
1116 _PyUnicode_STATE(unicode).ascii = is_ascii;
1117 if (is_ascii) {
1118 ((char*)data)[size] = 0;
1119 _PyUnicode_WSTR(unicode) = NULL;
1120 }
Victor Stinner8f825062012-04-27 13:55:39 +02001121 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001122 ((char*)data)[size] = 0;
1123 _PyUnicode_WSTR(unicode) = NULL;
1124 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001126 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001127 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128 else {
1129 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001130 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001131 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001132 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001133 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 ((Py_UCS4*)data)[size] = 0;
1135 if (is_sharing) {
1136 _PyUnicode_WSTR_LENGTH(unicode) = size;
1137 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1138 }
1139 else {
1140 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1141 _PyUnicode_WSTR(unicode) = NULL;
1142 }
1143 }
Victor Stinner8f825062012-04-27 13:55:39 +02001144#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001145 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001146#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001147 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148 return obj;
1149}
1150
1151#if SIZEOF_WCHAR_T == 2
1152/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1153 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001154 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001155
1156 This function assumes that unicode can hold one more code point than wstr
1157 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001158static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001160 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001161{
1162 const wchar_t *iter;
1163 Py_UCS4 *ucs4_out;
1164
Victor Stinner910337b2011-10-03 03:20:16 +02001165 assert(unicode != NULL);
1166 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1168 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1169
1170 for (iter = begin; iter < end; ) {
1171 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1172 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001173 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1174 && (iter+1) < end
1175 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001176 {
Victor Stinner551ac952011-11-29 22:58:13 +01001177 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001178 iter += 2;
1179 }
1180 else {
1181 *ucs4_out++ = *iter;
1182 iter++;
1183 }
1184 }
1185 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1186 _PyUnicode_GET_LENGTH(unicode)));
1187
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001188}
1189#endif
1190
Victor Stinnercd9950f2011-10-02 00:34:53 +02001191static int
Victor Stinner488fa492011-12-12 00:01:39 +01001192unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001193{
Victor Stinner488fa492011-12-12 00:01:39 +01001194 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001195 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001196 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001197 return -1;
1198 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001199 return 0;
1200}
1201
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001202static int
1203_copy_characters(PyObject *to, Py_ssize_t to_start,
1204 PyObject *from, Py_ssize_t from_start,
1205 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001206{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001207 unsigned int from_kind, to_kind;
1208 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209
Victor Stinneree4544c2012-05-09 22:24:08 +02001210 assert(0 <= how_many);
1211 assert(0 <= from_start);
1212 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001213 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001214 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001215 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216
Victor Stinnerd3f08822012-05-29 12:57:52 +02001217 assert(PyUnicode_Check(to));
1218 assert(PyUnicode_IS_READY(to));
1219 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1220
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001221 if (how_many == 0)
1222 return 0;
1223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001225 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001226 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001227 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228
Victor Stinnerf1852262012-06-16 16:38:26 +02001229#ifdef Py_DEBUG
1230 if (!check_maxchar
1231 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1232 {
1233 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1234 Py_UCS4 ch;
1235 Py_ssize_t i;
1236 for (i=0; i < how_many; i++) {
1237 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1238 assert(ch <= to_maxchar);
1239 }
1240 }
1241#endif
1242
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001243 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001244 if (check_maxchar
1245 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1246 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001247 /* Writing Latin-1 characters into an ASCII string requires to
1248 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001249 Py_UCS4 max_char;
1250 max_char = ucs1lib_find_max_char(from_data,
1251 (Py_UCS1*)from_data + how_many);
1252 if (max_char >= 128)
1253 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001254 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001255 Py_MEMCPY((char*)to_data + to_kind * to_start,
1256 (char*)from_data + from_kind * from_start,
1257 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001259 else if (from_kind == PyUnicode_1BYTE_KIND
1260 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001261 {
1262 _PyUnicode_CONVERT_BYTES(
1263 Py_UCS1, Py_UCS2,
1264 PyUnicode_1BYTE_DATA(from) + from_start,
1265 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1266 PyUnicode_2BYTE_DATA(to) + to_start
1267 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001268 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001269 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001270 && to_kind == PyUnicode_4BYTE_KIND)
1271 {
1272 _PyUnicode_CONVERT_BYTES(
1273 Py_UCS1, Py_UCS4,
1274 PyUnicode_1BYTE_DATA(from) + from_start,
1275 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1276 PyUnicode_4BYTE_DATA(to) + to_start
1277 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001278 }
1279 else if (from_kind == PyUnicode_2BYTE_KIND
1280 && to_kind == PyUnicode_4BYTE_KIND)
1281 {
1282 _PyUnicode_CONVERT_BYTES(
1283 Py_UCS2, Py_UCS4,
1284 PyUnicode_2BYTE_DATA(from) + from_start,
1285 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1286 PyUnicode_4BYTE_DATA(to) + to_start
1287 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001288 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001289 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001290 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1291
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001292 if (!check_maxchar) {
1293 if (from_kind == PyUnicode_2BYTE_KIND
1294 && to_kind == PyUnicode_1BYTE_KIND)
1295 {
1296 _PyUnicode_CONVERT_BYTES(
1297 Py_UCS2, Py_UCS1,
1298 PyUnicode_2BYTE_DATA(from) + from_start,
1299 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1300 PyUnicode_1BYTE_DATA(to) + to_start
1301 );
1302 }
1303 else if (from_kind == PyUnicode_4BYTE_KIND
1304 && to_kind == PyUnicode_1BYTE_KIND)
1305 {
1306 _PyUnicode_CONVERT_BYTES(
1307 Py_UCS4, Py_UCS1,
1308 PyUnicode_4BYTE_DATA(from) + from_start,
1309 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1310 PyUnicode_1BYTE_DATA(to) + to_start
1311 );
1312 }
1313 else if (from_kind == PyUnicode_4BYTE_KIND
1314 && to_kind == PyUnicode_2BYTE_KIND)
1315 {
1316 _PyUnicode_CONVERT_BYTES(
1317 Py_UCS4, Py_UCS2,
1318 PyUnicode_4BYTE_DATA(from) + from_start,
1319 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1320 PyUnicode_2BYTE_DATA(to) + to_start
1321 );
1322 }
1323 else {
1324 assert(0);
1325 return -1;
1326 }
1327 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001328 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001329 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001330 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001331 Py_ssize_t i;
1332
Victor Stinnera0702ab2011-09-29 14:14:38 +02001333 for (i=0; i < how_many; i++) {
1334 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001335 if (ch > to_maxchar)
1336 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001337 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1338 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001339 }
1340 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001341 return 0;
1342}
1343
Victor Stinnerd3f08822012-05-29 12:57:52 +02001344void
1345_PyUnicode_FastCopyCharacters(
1346 PyObject *to, Py_ssize_t to_start,
1347 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001348{
1349 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1350}
1351
1352Py_ssize_t
1353PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1354 PyObject *from, Py_ssize_t from_start,
1355 Py_ssize_t how_many)
1356{
1357 int err;
1358
1359 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1360 PyErr_BadInternalCall();
1361 return -1;
1362 }
1363
Benjamin Petersonbac79492012-01-14 13:34:47 -05001364 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001365 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001366 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001367 return -1;
1368
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001369 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001370 PyErr_SetString(PyExc_IndexError, "string index out of range");
1371 return -1;
1372 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001373 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001374 PyErr_SetString(PyExc_IndexError, "string index out of range");
1375 return -1;
1376 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001377 if (how_many < 0) {
1378 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1379 return -1;
1380 }
1381 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001382 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1383 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001384 "Cannot write %zi characters at %zi "
1385 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001386 how_many, to_start, PyUnicode_GET_LENGTH(to));
1387 return -1;
1388 }
1389
1390 if (how_many == 0)
1391 return 0;
1392
Victor Stinner488fa492011-12-12 00:01:39 +01001393 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001394 return -1;
1395
1396 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1397 if (err) {
1398 PyErr_Format(PyExc_SystemError,
1399 "Cannot copy %s characters "
1400 "into a string of %s characters",
1401 unicode_kind_name(from),
1402 unicode_kind_name(to));
1403 return -1;
1404 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001405 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406}
1407
Victor Stinner17222162011-09-28 22:15:37 +02001408/* Find the maximum code point and count the number of surrogate pairs so a
1409 correct string length can be computed before converting a string to UCS4.
1410 This function counts single surrogates as a character and not as a pair.
1411
1412 Return 0 on success, or -1 on error. */
1413static int
1414find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1415 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416{
1417 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001418 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419
Victor Stinnerc53be962011-10-02 21:33:54 +02001420 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 *num_surrogates = 0;
1422 *maxchar = 0;
1423
1424 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001426 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1427 && (iter+1) < end
1428 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1429 {
1430 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1431 ++(*num_surrogates);
1432 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 }
1434 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001435#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001436 {
1437 ch = *iter;
1438 iter++;
1439 }
1440 if (ch > *maxchar) {
1441 *maxchar = ch;
1442 if (*maxchar > MAX_UNICODE) {
1443 PyErr_Format(PyExc_ValueError,
1444 "character U+%x is not in range [U+0000; U+10ffff]",
1445 ch);
1446 return -1;
1447 }
1448 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001449 }
1450 return 0;
1451}
1452
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001453int
1454_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001455{
1456 wchar_t *end;
1457 Py_UCS4 maxchar = 0;
1458 Py_ssize_t num_surrogates;
1459#if SIZEOF_WCHAR_T == 2
1460 Py_ssize_t length_wo_surrogates;
1461#endif
1462
Georg Brandl7597add2011-10-05 16:36:47 +02001463 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001464 strings were created using _PyObject_New() and where no canonical
1465 representation (the str field) has been set yet aka strings
1466 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001467 assert(_PyUnicode_CHECK(unicode));
1468 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001469 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001470 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001471 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001472 /* Actually, it should neither be interned nor be anything else: */
1473 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001475 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001476 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001477 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001479
1480 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001481 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1482 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001483 PyErr_NoMemory();
1484 return -1;
1485 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001486 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 _PyUnicode_WSTR(unicode), end,
1488 PyUnicode_1BYTE_DATA(unicode));
1489 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1490 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1491 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1492 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001493 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001494 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001495 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001496 }
1497 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001498 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001499 _PyUnicode_UTF8(unicode) = NULL;
1500 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001501 }
1502 PyObject_FREE(_PyUnicode_WSTR(unicode));
1503 _PyUnicode_WSTR(unicode) = NULL;
1504 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1505 }
1506 /* In this case we might have to convert down from 4-byte native
1507 wchar_t to 2-byte unicode. */
1508 else if (maxchar < 65536) {
1509 assert(num_surrogates == 0 &&
1510 "FindMaxCharAndNumSurrogatePairs() messed up");
1511
Victor Stinner506f5922011-09-28 22:34:18 +02001512#if SIZEOF_WCHAR_T == 2
1513 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001514 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001515 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1516 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1517 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001518 _PyUnicode_UTF8(unicode) = NULL;
1519 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001520#else
1521 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001522 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001523 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001524 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001525 PyErr_NoMemory();
1526 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001527 }
Victor Stinner506f5922011-09-28 22:34:18 +02001528 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1529 _PyUnicode_WSTR(unicode), end,
1530 PyUnicode_2BYTE_DATA(unicode));
1531 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1532 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1533 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001534 _PyUnicode_UTF8(unicode) = NULL;
1535 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001536 PyObject_FREE(_PyUnicode_WSTR(unicode));
1537 _PyUnicode_WSTR(unicode) = NULL;
1538 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1539#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001540 }
1541 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1542 else {
1543#if SIZEOF_WCHAR_T == 2
1544 /* in case the native representation is 2-bytes, we need to allocate a
1545 new normalized 4-byte version. */
1546 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001547 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1548 PyErr_NoMemory();
1549 return -1;
1550 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001551 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1552 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001553 PyErr_NoMemory();
1554 return -1;
1555 }
1556 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1557 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001558 _PyUnicode_UTF8(unicode) = NULL;
1559 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001560 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1561 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001562 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001563 PyObject_FREE(_PyUnicode_WSTR(unicode));
1564 _PyUnicode_WSTR(unicode) = NULL;
1565 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1566#else
1567 assert(num_surrogates == 0);
1568
Victor Stinnerc3c74152011-10-02 20:39:55 +02001569 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001570 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001571 _PyUnicode_UTF8(unicode) = NULL;
1572 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001573 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1574#endif
1575 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1576 }
1577 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001578 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001579 return 0;
1580}
1581
Alexander Belopolsky40018472011-02-26 01:02:56 +00001582static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001583unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001584{
Walter Dörwald16807132007-05-25 13:52:07 +00001585 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001586 case SSTATE_NOT_INTERNED:
1587 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001588
Benjamin Peterson29060642009-01-31 22:14:21 +00001589 case SSTATE_INTERNED_MORTAL:
1590 /* revive dead object temporarily for DelItem */
1591 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001592 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001593 Py_FatalError(
1594 "deletion of interned string failed");
1595 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001596
Benjamin Peterson29060642009-01-31 22:14:21 +00001597 case SSTATE_INTERNED_IMMORTAL:
1598 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001599
Benjamin Peterson29060642009-01-31 22:14:21 +00001600 default:
1601 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001602 }
1603
Victor Stinner03490912011-10-03 23:45:12 +02001604 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001605 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001606 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001607 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001608 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1609 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001610
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001611 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001612}
1613
#ifdef Py_DEBUG
/* Debug helper: return 1 if 'unicode' is one of the shared singletons,
   i.e. the empty string or a cached one-character string stored in
   unicode_latin1[], and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string that is not in the wchar_t-only state
       can be an entry of the unicode_latin1[] cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1630
Alexander Belopolsky40018472011-02-26 01:02:56 +00001631static int
Victor Stinner488fa492011-12-12 00:01:39 +01001632unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001633{
Victor Stinner488fa492011-12-12 00:01:39 +01001634 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001635 if (Py_REFCNT(unicode) != 1)
1636 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001637 if (_PyUnicode_HASH(unicode) != -1)
1638 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001639 if (PyUnicode_CHECK_INTERNED(unicode))
1640 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001641 if (!PyUnicode_CheckExact(unicode))
1642 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001643#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001644 /* singleton refcount is greater than 1 */
1645 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001646#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001647 return 1;
1648}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001649
Victor Stinnerfe226c02011-10-03 03:52:20 +02001650static int
1651unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1652{
1653 PyObject *unicode;
1654 Py_ssize_t old_length;
1655
1656 assert(p_unicode != NULL);
1657 unicode = *p_unicode;
1658
1659 assert(unicode != NULL);
1660 assert(PyUnicode_Check(unicode));
1661 assert(0 <= length);
1662
Victor Stinner910337b2011-10-03 03:20:16 +02001663 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001664 old_length = PyUnicode_WSTR_LENGTH(unicode);
1665 else
1666 old_length = PyUnicode_GET_LENGTH(unicode);
1667 if (old_length == length)
1668 return 0;
1669
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001670 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001671 _Py_INCREF_UNICODE_EMPTY();
1672 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001673 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001674 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001675 return 0;
1676 }
1677
Victor Stinner488fa492011-12-12 00:01:39 +01001678 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001679 PyObject *copy = resize_copy(unicode, length);
1680 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001681 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001682 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001683 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001684 }
1685
Victor Stinnerfe226c02011-10-03 03:52:20 +02001686 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001687 PyObject *new_unicode = resize_compact(unicode, length);
1688 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001689 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001690 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001691 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001692 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001693 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001694}
1695
Alexander Belopolsky40018472011-02-26 01:02:56 +00001696int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001697PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001698{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001699 PyObject *unicode;
1700 if (p_unicode == NULL) {
1701 PyErr_BadInternalCall();
1702 return -1;
1703 }
1704 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001705 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001706 {
1707 PyErr_BadInternalCall();
1708 return -1;
1709 }
1710 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001711}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001712
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001713/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001714
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001715 WARNING: The function doesn't copy the terminating null character and
1716 doesn't check the maximum character (may write a latin1 character in an
1717 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001718static void
1719unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1720 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001721{
1722 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1723 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001724 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001725
1726 switch (kind) {
1727 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001728 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001729#ifdef Py_DEBUG
1730 if (PyUnicode_IS_ASCII(unicode)) {
1731 Py_UCS4 maxchar = ucs1lib_find_max_char(
1732 (const Py_UCS1*)str,
1733 (const Py_UCS1*)str + len);
1734 assert(maxchar < 128);
1735 }
1736#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001737 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001738 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001739 }
1740 case PyUnicode_2BYTE_KIND: {
1741 Py_UCS2 *start = (Py_UCS2 *)data + index;
1742 Py_UCS2 *ucs2 = start;
1743 assert(index <= PyUnicode_GET_LENGTH(unicode));
1744
Victor Stinner184252a2012-06-16 02:57:41 +02001745 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001746 *ucs2 = (Py_UCS2)*str;
1747
1748 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001749 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001750 }
1751 default: {
1752 Py_UCS4 *start = (Py_UCS4 *)data + index;
1753 Py_UCS4 *ucs4 = start;
1754 assert(kind == PyUnicode_4BYTE_KIND);
1755 assert(index <= PyUnicode_GET_LENGTH(unicode));
1756
Victor Stinner184252a2012-06-16 02:57:41 +02001757 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001758 *ucs4 = (Py_UCS4)*str;
1759
1760 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001761 }
1762 }
1763}
1764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765static PyObject*
1766get_latin1_char(unsigned char ch)
1767{
Victor Stinnera464fc12011-10-02 20:39:30 +02001768 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001770 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 if (!unicode)
1772 return NULL;
1773 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001774 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775 unicode_latin1[ch] = unicode;
1776 }
1777 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001778 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001779}
1780
Victor Stinner985a82a2014-01-03 12:53:47 +01001781static PyObject*
1782unicode_char(Py_UCS4 ch)
1783{
1784 PyObject *unicode;
1785
1786 assert(ch <= MAX_UNICODE);
1787
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001788 if (ch < 256)
1789 return get_latin1_char(ch);
1790
Victor Stinner985a82a2014-01-03 12:53:47 +01001791 unicode = PyUnicode_New(1, ch);
1792 if (unicode == NULL)
1793 return NULL;
1794 switch (PyUnicode_KIND(unicode)) {
1795 case PyUnicode_1BYTE_KIND:
1796 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1797 break;
1798 case PyUnicode_2BYTE_KIND:
1799 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1800 break;
1801 default:
1802 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1803 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1804 }
1805 assert(_PyUnicode_CheckConsistency(unicode, 1));
1806 return unicode;
1807}
1808
Alexander Belopolsky40018472011-02-26 01:02:56 +00001809PyObject *
1810PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001812 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 Py_UCS4 maxchar = 0;
1814 Py_ssize_t num_surrogates;
1815
1816 if (u == NULL)
1817 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001818
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001819 /* If the Unicode data is known at construction time, we can apply
1820 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001823 if (size == 0)
1824 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 /* Single character Unicode objects in the Latin-1 range are
1827 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001828 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001829 return get_latin1_char((unsigned char)*u);
1830
1831 /* If not empty and not single character, copy the Unicode data
1832 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001833 if (find_maxchar_surrogates(u, u + size,
1834 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 return NULL;
1836
Victor Stinner8faf8212011-12-08 22:14:11 +01001837 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001838 if (!unicode)
1839 return NULL;
1840
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841 switch (PyUnicode_KIND(unicode)) {
1842 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001843 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001844 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1845 break;
1846 case PyUnicode_2BYTE_KIND:
1847#if Py_UNICODE_SIZE == 2
1848 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1849#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001850 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001851 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1852#endif
1853 break;
1854 case PyUnicode_4BYTE_KIND:
1855#if SIZEOF_WCHAR_T == 2
1856 /* This is the only case which has to process surrogates, thus
1857 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001858 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001859#else
1860 assert(num_surrogates == 0);
1861 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1862#endif
1863 break;
1864 default:
1865 assert(0 && "Impossible state");
1866 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001867
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001868 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001869}
1870
Alexander Belopolsky40018472011-02-26 01:02:56 +00001871PyObject *
1872PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001873{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001874 if (size < 0) {
1875 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001876 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001877 return NULL;
1878 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001879 if (u != NULL)
1880 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1881 else
1882 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001883}
1884
Alexander Belopolsky40018472011-02-26 01:02:56 +00001885PyObject *
1886PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001887{
1888 size_t size = strlen(u);
1889 if (size > PY_SSIZE_T_MAX) {
1890 PyErr_SetString(PyExc_OverflowError, "input too long");
1891 return NULL;
1892 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001893 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001894}
1895
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001896PyObject *
1897_PyUnicode_FromId(_Py_Identifier *id)
1898{
1899 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001900 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1901 strlen(id->string),
1902 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001903 if (!id->object)
1904 return NULL;
1905 PyUnicode_InternInPlace(&id->object);
1906 assert(!id->next);
1907 id->next = static_strings;
1908 static_strings = id;
1909 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001910 return id->object;
1911}
1912
1913void
1914_PyUnicode_ClearStaticStrings()
1915{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001916 _Py_Identifier *tmp, *s = static_strings;
1917 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001918 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001919 tmp = s->next;
1920 s->next = NULL;
1921 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001922 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001923 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001924}
1925
Benjamin Peterson0df54292012-03-26 14:50:32 -04001926/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001927
Victor Stinnerd3f08822012-05-29 12:57:52 +02001928PyObject*
1929_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001930{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001931 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001932 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001933 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001934#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001935 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001936#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001937 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001938 }
Victor Stinner785938e2011-12-11 20:09:03 +01001939 unicode = PyUnicode_New(size, 127);
1940 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001941 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001942 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1943 assert(_PyUnicode_CheckConsistency(unicode, 1));
1944 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001945}
1946
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001947static Py_UCS4
1948kind_maxchar_limit(unsigned int kind)
1949{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001950 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001951 case PyUnicode_1BYTE_KIND:
1952 return 0x80;
1953 case PyUnicode_2BYTE_KIND:
1954 return 0x100;
1955 case PyUnicode_4BYTE_KIND:
1956 return 0x10000;
1957 default:
1958 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001959 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001960 }
1961}
1962
Victor Stinnere6abb482012-05-02 01:15:40 +02001963Py_LOCAL_INLINE(Py_UCS4)
1964align_maxchar(Py_UCS4 maxchar)
1965{
1966 if (maxchar <= 127)
1967 return 127;
1968 else if (maxchar <= 255)
1969 return 255;
1970 else if (maxchar <= 65535)
1971 return 65535;
1972 else
1973 return MAX_UNICODE;
1974}
1975
Victor Stinner702c7342011-10-05 13:50:52 +02001976static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001977_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001978{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001979 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001980 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001981
Serhiy Storchaka678db842013-01-26 12:16:36 +02001982 if (size == 0)
1983 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001984 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001985 if (size == 1)
1986 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001987
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001988 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001989 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990 if (!res)
1991 return NULL;
1992 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001993 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001995}
1996
Victor Stinnere57b1c02011-09-28 22:20:48 +02001997static PyObject*
1998_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999{
2000 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002001 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002002
Serhiy Storchaka678db842013-01-26 12:16:36 +02002003 if (size == 0)
2004 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002005 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002006 if (size == 1)
2007 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002008
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002009 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002010 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002011 if (!res)
2012 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002013 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002015 else {
2016 _PyUnicode_CONVERT_BYTES(
2017 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2018 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002019 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002020 return res;
2021}
2022
Victor Stinnere57b1c02011-09-28 22:20:48 +02002023static PyObject*
2024_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002025{
2026 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002027 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002028
Serhiy Storchaka678db842013-01-26 12:16:36 +02002029 if (size == 0)
2030 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002031 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002032 if (size == 1)
2033 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002034
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002035 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002036 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 if (!res)
2038 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002039 if (max_char < 256)
2040 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2041 PyUnicode_1BYTE_DATA(res));
2042 else if (max_char < 0x10000)
2043 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2044 PyUnicode_2BYTE_DATA(res));
2045 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002047 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 return res;
2049}
2050
2051PyObject*
2052PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2053{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002054 if (size < 0) {
2055 PyErr_SetString(PyExc_ValueError, "size must be positive");
2056 return NULL;
2057 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002058 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002060 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002061 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002062 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002063 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002064 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002065 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002066 PyErr_SetString(PyExc_SystemError, "invalid kind");
2067 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002068 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002069}
2070
Victor Stinnerece58de2012-04-23 23:36:38 +02002071Py_UCS4
2072_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2073{
2074 enum PyUnicode_Kind kind;
2075 void *startptr, *endptr;
2076
2077 assert(PyUnicode_IS_READY(unicode));
2078 assert(0 <= start);
2079 assert(end <= PyUnicode_GET_LENGTH(unicode));
2080 assert(start <= end);
2081
2082 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2083 return PyUnicode_MAX_CHAR_VALUE(unicode);
2084
2085 if (start == end)
2086 return 127;
2087
Victor Stinner94d558b2012-04-27 22:26:58 +02002088 if (PyUnicode_IS_ASCII(unicode))
2089 return 127;
2090
Victor Stinnerece58de2012-04-23 23:36:38 +02002091 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002092 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002093 endptr = (char *)startptr + end * kind;
2094 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002095 switch(kind) {
2096 case PyUnicode_1BYTE_KIND:
2097 return ucs1lib_find_max_char(startptr, endptr);
2098 case PyUnicode_2BYTE_KIND:
2099 return ucs2lib_find_max_char(startptr, endptr);
2100 case PyUnicode_4BYTE_KIND:
2101 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002102 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002103 assert(0);
2104 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002105 }
2106}
2107
Victor Stinner25a4b292011-10-06 12:31:55 +02002108/* Ensure that a string uses the most efficient storage, if it is not the
2109 case: create a new string with of the right kind. Write NULL into *p_unicode
2110 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002111static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002112unicode_adjust_maxchar(PyObject **p_unicode)
2113{
2114 PyObject *unicode, *copy;
2115 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002116 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002117 unsigned int kind;
2118
2119 assert(p_unicode != NULL);
2120 unicode = *p_unicode;
2121 assert(PyUnicode_IS_READY(unicode));
2122 if (PyUnicode_IS_ASCII(unicode))
2123 return;
2124
2125 len = PyUnicode_GET_LENGTH(unicode);
2126 kind = PyUnicode_KIND(unicode);
2127 if (kind == PyUnicode_1BYTE_KIND) {
2128 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002129 max_char = ucs1lib_find_max_char(u, u + len);
2130 if (max_char >= 128)
2131 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002132 }
2133 else if (kind == PyUnicode_2BYTE_KIND) {
2134 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002135 max_char = ucs2lib_find_max_char(u, u + len);
2136 if (max_char >= 256)
2137 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002138 }
2139 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002140 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002141 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002142 max_char = ucs4lib_find_max_char(u, u + len);
2143 if (max_char >= 0x10000)
2144 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002145 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002146 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002147 if (copy != NULL)
2148 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002149 Py_DECREF(unicode);
2150 *p_unicode = copy;
2151}
2152
Victor Stinner034f6cf2011-09-30 02:26:44 +02002153PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002154_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002155{
Victor Stinner87af4f22011-11-21 23:03:47 +01002156 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002157 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002158
Victor Stinner034f6cf2011-09-30 02:26:44 +02002159 if (!PyUnicode_Check(unicode)) {
2160 PyErr_BadInternalCall();
2161 return NULL;
2162 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002163 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002164 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002165
Victor Stinner87af4f22011-11-21 23:03:47 +01002166 length = PyUnicode_GET_LENGTH(unicode);
2167 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002168 if (!copy)
2169 return NULL;
2170 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2171
Victor Stinner87af4f22011-11-21 23:03:47 +01002172 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2173 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002174 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002175 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002176}
2177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002178
Victor Stinnerbc603d12011-10-02 01:00:40 +02002179/* Widen Unicode objects to larger buffers. Don't write terminating null
2180 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002181
2182void*
2183_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2184{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002185 Py_ssize_t len;
2186 void *result;
2187 unsigned int skind;
2188
Benjamin Petersonbac79492012-01-14 13:34:47 -05002189 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002190 return NULL;
2191
2192 len = PyUnicode_GET_LENGTH(s);
2193 skind = PyUnicode_KIND(s);
2194 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002195 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196 return NULL;
2197 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002198 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002199 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002200 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002201 if (!result)
2202 return PyErr_NoMemory();
2203 assert(skind == PyUnicode_1BYTE_KIND);
2204 _PyUnicode_CONVERT_BYTES(
2205 Py_UCS1, Py_UCS2,
2206 PyUnicode_1BYTE_DATA(s),
2207 PyUnicode_1BYTE_DATA(s) + len,
2208 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002210 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002211 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002212 if (!result)
2213 return PyErr_NoMemory();
2214 if (skind == PyUnicode_2BYTE_KIND) {
2215 _PyUnicode_CONVERT_BYTES(
2216 Py_UCS2, Py_UCS4,
2217 PyUnicode_2BYTE_DATA(s),
2218 PyUnicode_2BYTE_DATA(s) + len,
2219 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002221 else {
2222 assert(skind == PyUnicode_1BYTE_KIND);
2223 _PyUnicode_CONVERT_BYTES(
2224 Py_UCS1, Py_UCS4,
2225 PyUnicode_1BYTE_DATA(s),
2226 PyUnicode_1BYTE_DATA(s) + len,
2227 result);
2228 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002230 default:
2231 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002232 }
Victor Stinner01698042011-10-04 00:04:26 +02002233 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 return NULL;
2235}
2236
2237static Py_UCS4*
2238as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2239 int copy_null)
2240{
2241 int kind;
2242 void *data;
2243 Py_ssize_t len, targetlen;
2244 if (PyUnicode_READY(string) == -1)
2245 return NULL;
2246 kind = PyUnicode_KIND(string);
2247 data = PyUnicode_DATA(string);
2248 len = PyUnicode_GET_LENGTH(string);
2249 targetlen = len;
2250 if (copy_null)
2251 targetlen++;
2252 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002253 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254 if (!target) {
2255 PyErr_NoMemory();
2256 return NULL;
2257 }
2258 }
2259 else {
2260 if (targetsize < targetlen) {
2261 PyErr_Format(PyExc_SystemError,
2262 "string is longer than the buffer");
2263 if (copy_null && 0 < targetsize)
2264 target[0] = 0;
2265 return NULL;
2266 }
2267 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002268 if (kind == PyUnicode_1BYTE_KIND) {
2269 Py_UCS1 *start = (Py_UCS1 *) data;
2270 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002272 else if (kind == PyUnicode_2BYTE_KIND) {
2273 Py_UCS2 *start = (Py_UCS2 *) data;
2274 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2275 }
2276 else {
2277 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002279 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002280 if (copy_null)
2281 target[len] = 0;
2282 return target;
2283}
2284
2285Py_UCS4*
2286PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2287 int copy_null)
2288{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002289 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002290 PyErr_BadInternalCall();
2291 return NULL;
2292 }
2293 return as_ucs4(string, target, targetsize, copy_null);
2294}
2295
2296Py_UCS4*
2297PyUnicode_AsUCS4Copy(PyObject *string)
2298{
2299 return as_ucs4(string, NULL, 0, 1);
2300}
2301
2302#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002303
Alexander Belopolsky40018472011-02-26 01:02:56 +00002304PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002305PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002306{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002307 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002309 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002310 PyErr_BadInternalCall();
2311 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312 }
2313
Martin v. Löwis790465f2008-04-05 20:41:37 +00002314 if (size == -1) {
2315 size = wcslen(w);
2316 }
2317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002318 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002319}
2320
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002321#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002322
Victor Stinner15a11362012-10-06 23:48:20 +02002323/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002324 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2325 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2326#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002327
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002328static int
2329unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2330 Py_ssize_t width, Py_ssize_t precision)
2331{
2332 Py_ssize_t length, fill, arglen;
2333 Py_UCS4 maxchar;
2334
2335 if (PyUnicode_READY(str) == -1)
2336 return -1;
2337
2338 length = PyUnicode_GET_LENGTH(str);
2339 if ((precision == -1 || precision >= length)
2340 && width <= length)
2341 return _PyUnicodeWriter_WriteStr(writer, str);
2342
2343 if (precision != -1)
2344 length = Py_MIN(precision, length);
2345
2346 arglen = Py_MAX(length, width);
2347 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2348 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2349 else
2350 maxchar = writer->maxchar;
2351
2352 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2353 return -1;
2354
2355 if (width > length) {
2356 fill = width - length;
2357 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2358 return -1;
2359 writer->pos += fill;
2360 }
2361
2362 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2363 str, 0, length);
2364 writer->pos += length;
2365 return 0;
2366}
2367
2368static int
2369unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2370 Py_ssize_t width, Py_ssize_t precision)
2371{
2372 /* UTF-8 */
2373 Py_ssize_t length;
2374 PyObject *unicode;
2375 int res;
2376
2377 length = strlen(str);
2378 if (precision != -1)
2379 length = Py_MIN(length, precision);
2380 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2381 if (unicode == NULL)
2382 return -1;
2383
2384 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2385 Py_DECREF(unicode);
2386 return res;
2387}
2388
Victor Stinner96865452011-03-01 23:44:09 +00002389static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002390unicode_fromformat_arg(_PyUnicodeWriter *writer,
2391 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002392{
Victor Stinnere215d962012-10-06 23:03:36 +02002393 const char *p;
2394 Py_ssize_t len;
2395 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002396 Py_ssize_t width;
2397 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002398 int longflag;
2399 int longlongflag;
2400 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002401 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002402
2403 p = f;
2404 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002405 zeropad = 0;
2406 if (*f == '0') {
2407 zeropad = 1;
2408 f++;
2409 }
Victor Stinner96865452011-03-01 23:44:09 +00002410
2411 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002412 width = -1;
2413 if (Py_ISDIGIT((unsigned)*f)) {
2414 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002415 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002416 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002417 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002418 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002419 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002420 return NULL;
2421 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002422 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002423 f++;
2424 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002425 }
2426 precision = -1;
2427 if (*f == '.') {
2428 f++;
2429 if (Py_ISDIGIT((unsigned)*f)) {
2430 precision = (*f - '0');
2431 f++;
2432 while (Py_ISDIGIT((unsigned)*f)) {
2433 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2434 PyErr_SetString(PyExc_ValueError,
2435 "precision too big");
2436 return NULL;
2437 }
2438 precision = (precision * 10) + (*f - '0');
2439 f++;
2440 }
2441 }
Victor Stinner96865452011-03-01 23:44:09 +00002442 if (*f == '%') {
2443 /* "%.3%s" => f points to "3" */
2444 f--;
2445 }
2446 }
2447 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002448 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002449 f--;
2450 }
Victor Stinner96865452011-03-01 23:44:09 +00002451
2452 /* Handle %ld, %lu, %lld and %llu. */
2453 longflag = 0;
2454 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002455 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002456 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002457 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002458 longflag = 1;
2459 ++f;
2460 }
2461#ifdef HAVE_LONG_LONG
2462 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002463 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002464 longlongflag = 1;
2465 f += 2;
2466 }
2467#endif
2468 }
2469 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002470 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002471 size_tflag = 1;
2472 ++f;
2473 }
Victor Stinnere215d962012-10-06 23:03:36 +02002474
2475 if (f[1] == '\0')
2476 writer->overallocate = 0;
2477
2478 switch (*f) {
2479 case 'c':
2480 {
2481 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002482 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002483 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002484 "character argument not in range(0x110000)");
2485 return NULL;
2486 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002487 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002488 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002489 break;
2490 }
2491
2492 case 'i':
2493 case 'd':
2494 case 'u':
2495 case 'x':
2496 {
2497 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002498 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002499 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002500
2501 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002502 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002503 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002504 va_arg(*vargs, unsigned long));
2505#ifdef HAVE_LONG_LONG
2506 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002507 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002508 va_arg(*vargs, unsigned PY_LONG_LONG));
2509#endif
2510 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002511 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002512 va_arg(*vargs, size_t));
2513 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002514 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002515 va_arg(*vargs, unsigned int));
2516 }
2517 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002518 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002519 }
2520 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002521 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002522 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002523 va_arg(*vargs, long));
2524#ifdef HAVE_LONG_LONG
2525 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002526 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002527 va_arg(*vargs, PY_LONG_LONG));
2528#endif
2529 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002530 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002531 va_arg(*vargs, Py_ssize_t));
2532 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002533 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002534 va_arg(*vargs, int));
2535 }
2536 assert(len >= 0);
2537
Victor Stinnere215d962012-10-06 23:03:36 +02002538 if (precision < len)
2539 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002540
2541 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002542 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2543 return NULL;
2544
Victor Stinnere215d962012-10-06 23:03:36 +02002545 if (width > precision) {
2546 Py_UCS4 fillchar;
2547 fill = width - precision;
2548 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002549 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2550 return NULL;
2551 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002552 }
Victor Stinner15a11362012-10-06 23:48:20 +02002553 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002554 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002555 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2556 return NULL;
2557 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002558 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002559
Victor Stinner4a587072013-11-19 12:54:53 +01002560 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2561 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002562 break;
2563 }
2564
2565 case 'p':
2566 {
2567 char number[MAX_LONG_LONG_CHARS];
2568
2569 len = sprintf(number, "%p", va_arg(*vargs, void*));
2570 assert(len >= 0);
2571
2572 /* %p is ill-defined: ensure leading 0x. */
2573 if (number[1] == 'X')
2574 number[1] = 'x';
2575 else if (number[1] != 'x') {
2576 memmove(number + 2, number,
2577 strlen(number) + 1);
2578 number[0] = '0';
2579 number[1] = 'x';
2580 len += 2;
2581 }
2582
Victor Stinner4a587072013-11-19 12:54:53 +01002583 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002584 return NULL;
2585 break;
2586 }
2587
2588 case 's':
2589 {
2590 /* UTF-8 */
2591 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002592 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002593 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002594 break;
2595 }
2596
2597 case 'U':
2598 {
2599 PyObject *obj = va_arg(*vargs, PyObject *);
2600 assert(obj && _PyUnicode_CHECK(obj));
2601
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002602 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002603 return NULL;
2604 break;
2605 }
2606
2607 case 'V':
2608 {
2609 PyObject *obj = va_arg(*vargs, PyObject *);
2610 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002611 if (obj) {
2612 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002613 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002614 return NULL;
2615 }
2616 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002617 assert(str != NULL);
2618 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002619 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002620 }
2621 break;
2622 }
2623
2624 case 'S':
2625 {
2626 PyObject *obj = va_arg(*vargs, PyObject *);
2627 PyObject *str;
2628 assert(obj);
2629 str = PyObject_Str(obj);
2630 if (!str)
2631 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002632 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002633 Py_DECREF(str);
2634 return NULL;
2635 }
2636 Py_DECREF(str);
2637 break;
2638 }
2639
2640 case 'R':
2641 {
2642 PyObject *obj = va_arg(*vargs, PyObject *);
2643 PyObject *repr;
2644 assert(obj);
2645 repr = PyObject_Repr(obj);
2646 if (!repr)
2647 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002648 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002649 Py_DECREF(repr);
2650 return NULL;
2651 }
2652 Py_DECREF(repr);
2653 break;
2654 }
2655
2656 case 'A':
2657 {
2658 PyObject *obj = va_arg(*vargs, PyObject *);
2659 PyObject *ascii;
2660 assert(obj);
2661 ascii = PyObject_ASCII(obj);
2662 if (!ascii)
2663 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002664 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002665 Py_DECREF(ascii);
2666 return NULL;
2667 }
2668 Py_DECREF(ascii);
2669 break;
2670 }
2671
2672 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002673 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002674 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002675 break;
2676
2677 default:
2678 /* if we stumble upon an unknown formatting code, copy the rest
2679 of the format string to the output string. (we cannot just
2680 skip the code, since there's no way to know what's in the
2681 argument list) */
2682 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002683 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002684 return NULL;
2685 f = p+len;
2686 return f;
2687 }
2688
2689 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002690 return f;
2691}
2692
Walter Dörwaldd2034312007-05-18 16:29:38 +00002693PyObject *
2694PyUnicode_FromFormatV(const char *format, va_list vargs)
2695{
Victor Stinnere215d962012-10-06 23:03:36 +02002696 va_list vargs2;
2697 const char *f;
2698 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002699
Victor Stinner8f674cc2013-04-17 23:02:17 +02002700 _PyUnicodeWriter_Init(&writer);
2701 writer.min_length = strlen(format) + 100;
2702 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002703
2704 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2705 Copy it to be able to pass a reference to a subfunction. */
2706 Py_VA_COPY(vargs2, vargs);
2707
2708 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002709 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002710 f = unicode_fromformat_arg(&writer, f, &vargs2);
2711 if (f == NULL)
2712 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002713 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002714 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002715 const char *p;
2716 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002717
Victor Stinnere215d962012-10-06 23:03:36 +02002718 p = f;
2719 do
2720 {
2721 if ((unsigned char)*p > 127) {
2722 PyErr_Format(PyExc_ValueError,
2723 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2724 "string, got a non-ASCII byte: 0x%02x",
2725 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002726 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002727 }
2728 p++;
2729 }
2730 while (*p != '\0' && *p != '%');
2731 len = p - f;
2732
2733 if (*p == '\0')
2734 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002735
2736 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002737 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002738
2739 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002740 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002741 }
Victor Stinnere215d962012-10-06 23:03:36 +02002742 return _PyUnicodeWriter_Finish(&writer);
2743
2744 fail:
2745 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002746 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002747}
2748
Walter Dörwaldd2034312007-05-18 16:29:38 +00002749PyObject *
2750PyUnicode_FromFormat(const char *format, ...)
2751{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002752 PyObject* ret;
2753 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002754
2755#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002756 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002757#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002758 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002759#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002760 ret = PyUnicode_FromFormatV(format, vargs);
2761 va_end(vargs);
2762 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002763}
2764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002765#ifdef HAVE_WCHAR_H
2766
Victor Stinner5593d8a2010-10-02 11:11:27 +00002767/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2768 convert a Unicode object to a wide character string.
2769
Victor Stinnerd88d9832011-09-06 02:00:05 +02002770 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002771 character) required to convert the unicode object. Ignore size argument.
2772
Victor Stinnerd88d9832011-09-06 02:00:05 +02002773 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002774 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002775 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002776static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002777unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002778 wchar_t *w,
2779 Py_ssize_t size)
2780{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002781 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002782 const wchar_t *wstr;
2783
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002784 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002785 if (wstr == NULL)
2786 return -1;
2787
Victor Stinner5593d8a2010-10-02 11:11:27 +00002788 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002789 if (size > res)
2790 size = res + 1;
2791 else
2792 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002793 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002794 return res;
2795 }
2796 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002797 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002798}
2799
2800Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002801PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002802 wchar_t *w,
2803 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804{
2805 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002806 PyErr_BadInternalCall();
2807 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002809 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810}
2811
Victor Stinner137c34c2010-09-29 10:25:54 +00002812wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002813PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002814 Py_ssize_t *size)
2815{
2816 wchar_t* buffer;
2817 Py_ssize_t buflen;
2818
2819 if (unicode == NULL) {
2820 PyErr_BadInternalCall();
2821 return NULL;
2822 }
2823
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002824 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002825 if (buflen == -1)
2826 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002827 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00002828 if (buffer == NULL) {
2829 PyErr_NoMemory();
2830 return NULL;
2831 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002832 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002833 if (buflen == -1) {
2834 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002835 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002836 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002837 if (size != NULL)
2838 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002839 return buffer;
2840}
2841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002842#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843
Alexander Belopolsky40018472011-02-26 01:02:56 +00002844PyObject *
2845PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002846{
Victor Stinner8faf8212011-12-08 22:14:11 +01002847 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002848 PyErr_SetString(PyExc_ValueError,
2849 "chr() arg not in range(0x110000)");
2850 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002851 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002852
Victor Stinner985a82a2014-01-03 12:53:47 +01002853 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002854}
2855
Alexander Belopolsky40018472011-02-26 01:02:56 +00002856PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002857PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002858{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002859 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002860 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002861 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002862 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002863 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002864 Py_INCREF(obj);
2865 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002866 }
2867 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002868 /* For a Unicode subtype that's not a Unicode object,
2869 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002870 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002871 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002872 PyErr_Format(PyExc_TypeError,
2873 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002874 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002875 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002876}
2877
Alexander Belopolsky40018472011-02-26 01:02:56 +00002878PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002879PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002880 const char *encoding,
2881 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002882{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002883 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002884 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002885
Guido van Rossumd57fd912000-03-10 22:53:23 +00002886 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002887 PyErr_BadInternalCall();
2888 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002890
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002891 /* Decoding bytes objects is the most common case and should be fast */
2892 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002893 if (PyBytes_GET_SIZE(obj) == 0)
2894 _Py_RETURN_UNICODE_EMPTY();
2895 v = PyUnicode_Decode(
2896 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2897 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002898 return v;
2899 }
2900
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002901 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002902 PyErr_SetString(PyExc_TypeError,
2903 "decoding str is not supported");
2904 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002905 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002906
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002907 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2908 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2909 PyErr_Format(PyExc_TypeError,
Serhiy Storchakab757c832014-12-05 22:25:22 +02002910 "coercing to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002911 Py_TYPE(obj)->tp_name);
2912 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002913 }
Tim Petersced69f82003-09-16 20:30:58 +00002914
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002915 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002916 PyBuffer_Release(&buffer);
2917 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002918 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002919
Serhiy Storchaka05997252013-01-26 12:14:02 +02002920 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002921 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002922 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002923}
2924
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8. The normalized, null-terminated name is written to
   `lower`, a buffer of `lower_len` bytes. A NULL encoding is normalized
   to "utf-8".

   Return 1 on success, 0 on error (the result, including its terminating
   null byte, does not fit into lower_len bytes). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* A zero-sized buffer cannot even hold the terminator. Bail out before
       computing lower_len - 1 below: for size_t 0 that wraps around to
       SIZE_MAX, which would defeat the bounds check and write past the
       buffer. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot; reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2964
Alexander Belopolsky40018472011-02-26 01:02:56 +00002965PyObject *
2966PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002967 Py_ssize_t size,
2968 const char *encoding,
2969 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002970{
2971 PyObject *buffer = NULL, *unicode;
2972 Py_buffer info;
2973 char lower[11]; /* Enough for any encoding shortcut */
2974
Fred Drakee4315f52000-05-09 19:53:39 +00002975 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01002976 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002977 if ((strcmp(lower, "utf-8") == 0) ||
2978 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002979 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002980 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002981 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01002982 (strcmp(lower, "iso-8859-1") == 0) ||
2983 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002984 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002985#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002986 else if (strcmp(lower, "mbcs") == 0)
2987 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002988#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002989 else if (strcmp(lower, "ascii") == 0)
2990 return PyUnicode_DecodeASCII(s, size, errors);
2991 else if (strcmp(lower, "utf-16") == 0)
2992 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2993 else if (strcmp(lower, "utf-32") == 0)
2994 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2995 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996
2997 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002998 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002999 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003000 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003001 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003002 if (buffer == NULL)
3003 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003004 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003005 if (unicode == NULL)
3006 goto onError;
3007 if (!PyUnicode_Check(unicode)) {
3008 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003009 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3010 "use codecs.decode() to decode to arbitrary types",
3011 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003012 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003013 Py_DECREF(unicode);
3014 goto onError;
3015 }
3016 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003017 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003018
Benjamin Peterson29060642009-01-31 22:14:21 +00003019 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020 Py_XDECREF(buffer);
3021 return NULL;
3022}
3023
Alexander Belopolsky40018472011-02-26 01:02:56 +00003024PyObject *
3025PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003026 const char *encoding,
3027 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003028{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003029 if (!PyUnicode_Check(unicode)) {
3030 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003031 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003032 }
3033
3034 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003035 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003036
3037 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003038 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003039}
3040
Alexander Belopolsky40018472011-02-26 01:02:56 +00003041PyObject *
3042PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003043 const char *encoding,
3044 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003045{
3046 PyObject *v;
3047
3048 if (!PyUnicode_Check(unicode)) {
3049 PyErr_BadArgument();
3050 goto onError;
3051 }
3052
3053 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003054 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003055
3056 /* Decode via the codec registry */
3057 v = PyCodec_Decode(unicode, encoding, errors);
3058 if (v == NULL)
3059 goto onError;
3060 if (!PyUnicode_Check(v)) {
3061 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003062 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3063 "use codecs.decode() to decode to arbitrary types",
3064 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003065 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003066 Py_DECREF(v);
3067 goto onError;
3068 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003069 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003070
Benjamin Peterson29060642009-01-31 22:14:21 +00003071 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003072 return NULL;
3073}
3074
Alexander Belopolsky40018472011-02-26 01:02:56 +00003075PyObject *
3076PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003077 Py_ssize_t size,
3078 const char *encoding,
3079 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080{
3081 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003082
Guido van Rossumd57fd912000-03-10 22:53:23 +00003083 unicode = PyUnicode_FromUnicode(s, size);
3084 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003085 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3087 Py_DECREF(unicode);
3088 return v;
3089}
3090
Alexander Belopolsky40018472011-02-26 01:02:56 +00003091PyObject *
3092PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003093 const char *encoding,
3094 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003095{
3096 PyObject *v;
3097
3098 if (!PyUnicode_Check(unicode)) {
3099 PyErr_BadArgument();
3100 goto onError;
3101 }
3102
3103 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003104 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003105
3106 /* Encode via the codec registry */
3107 v = PyCodec_Encode(unicode, encoding, errors);
3108 if (v == NULL)
3109 goto onError;
3110 return v;
3111
Benjamin Peterson29060642009-01-31 22:14:21 +00003112 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003113 return NULL;
3114}
3115
/* Locate the first wide character of `wstr` that wcstombs() cannot
   encode, by converting the string one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).

   Return the index of the offending character in `wstr`, or 0 if every
   character converts successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or surrogate pair) followed by a null terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3] = {0, 0, 0};
#else
    wchar_t chunk[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cur = wstr;

    while (*cur != L'\0') {
        /* Remember where this character starts, for the error report. */
        const wchar_t *unit = cur;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cur[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cur[1]))
        {
            chunk[0] = cur[0];
            chunk[1] = cur[1];
            cur += 2;
        }
        else {
            chunk[0] = cur[0];
            chunk[1] = 0;
            cur += 1;
        }
#else
        chunk[0] = *cur++;
#endif
        if (wcstombs(scratch, chunk, sizeof(scratch)) == (size_t)-1)
            return unit - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3162
Victor Stinner1b579672011-12-17 05:47:23 +01003163static int
3164locale_error_handler(const char *errors, int *surrogateescape)
3165{
3166 if (errors == NULL) {
3167 *surrogateescape = 0;
3168 return 0;
3169 }
3170
3171 if (strcmp(errors, "strict") == 0) {
3172 *surrogateescape = 0;
3173 return 0;
3174 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003175 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003176 *surrogateescape = 1;
3177 return 0;
3178 }
3179 PyErr_Format(PyExc_ValueError,
3180 "only 'strict' and 'surrogateescape' error handlers "
3181 "are supported, not '%s'",
3182 errors);
3183 return -1;
3184}
3185
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003186PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003187PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003188{
3189 Py_ssize_t wlen, wlen2;
3190 wchar_t *wstr;
3191 PyObject *bytes = NULL;
3192 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003193 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003194 PyObject *exc;
3195 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003196 int surrogateescape;
3197
3198 if (locale_error_handler(errors, &surrogateescape) < 0)
3199 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003200
3201 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3202 if (wstr == NULL)
3203 return NULL;
3204
3205 wlen2 = wcslen(wstr);
3206 if (wlen2 != wlen) {
3207 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003208 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003209 return NULL;
3210 }
3211
3212 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003213 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003214 char *str;
3215
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003216 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003217 if (str == NULL) {
3218 if (error_pos == (size_t)-1) {
3219 PyErr_NoMemory();
3220 PyMem_Free(wstr);
3221 return NULL;
3222 }
3223 else {
3224 goto encode_error;
3225 }
3226 }
3227 PyMem_Free(wstr);
3228
3229 bytes = PyBytes_FromString(str);
3230 PyMem_Free(str);
3231 }
3232 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003233 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003234 size_t len, len2;
3235
3236 len = wcstombs(NULL, wstr, 0);
3237 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003238 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003239 goto encode_error;
3240 }
3241
3242 bytes = PyBytes_FromStringAndSize(NULL, len);
3243 if (bytes == NULL) {
3244 PyMem_Free(wstr);
3245 return NULL;
3246 }
3247
3248 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3249 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003250 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003251 goto encode_error;
3252 }
3253 PyMem_Free(wstr);
3254 }
3255 return bytes;
3256
3257encode_error:
3258 errmsg = strerror(errno);
3259 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003260
3261 if (error_pos == (size_t)-1)
3262 error_pos = wcstombs_errorpos(wstr);
3263
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003264 PyMem_Free(wstr);
3265 Py_XDECREF(bytes);
3266
Victor Stinner2f197072011-12-17 07:08:30 +01003267 if (errmsg != NULL) {
3268 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003269 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003270 if (wstr != NULL) {
3271 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003272 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003273 } else
3274 errmsg = NULL;
3275 }
3276 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003277 reason = PyUnicode_FromString(
3278 "wcstombs() encountered an unencodable "
3279 "wide character");
3280 if (reason == NULL)
3281 return NULL;
3282
3283 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3284 "locale", unicode,
3285 (Py_ssize_t)error_pos,
3286 (Py_ssize_t)(error_pos+1),
3287 reason);
3288 Py_DECREF(reason);
3289 if (exc != NULL) {
3290 PyCodec_StrictErrors(exc);
3291 Py_XDECREF(exc);
3292 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003293 return NULL;
3294}
3295
Victor Stinnerad158722010-10-27 00:25:46 +00003296PyObject *
3297PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003298{
Victor Stinner99b95382011-07-04 14:23:54 +02003299#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003300 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003301#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003302 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003303#else
Victor Stinner793b5312011-04-27 00:24:21 +02003304 PyInterpreterState *interp = PyThreadState_GET()->interp;
3305 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3306 cannot use it to encode and decode filenames before it is loaded. Load
3307 the Python codec requires to encode at least its own filename. Use the C
3308 version of the locale codec until the codec registry is initialized and
3309 the Python codec is loaded.
3310
3311 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3312 cannot only rely on it: check also interp->fscodec_initialized for
3313 subinterpreters. */
3314 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003315 return PyUnicode_AsEncodedString(unicode,
3316 Py_FileSystemDefaultEncoding,
3317 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003318 }
3319 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003320 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003321 }
Victor Stinnerad158722010-10-27 00:25:46 +00003322#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003323}
3324
Alexander Belopolsky40018472011-02-26 01:02:56 +00003325PyObject *
3326PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003327 const char *encoding,
3328 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003329{
3330 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003331 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003332
Guido van Rossumd57fd912000-03-10 22:53:23 +00003333 if (!PyUnicode_Check(unicode)) {
3334 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003335 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003336 }
Fred Drakee4315f52000-05-09 19:53:39 +00003337
Fred Drakee4315f52000-05-09 19:53:39 +00003338 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003339 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003340 if ((strcmp(lower, "utf-8") == 0) ||
3341 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003342 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003343 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003344 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003345 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003346 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003347 }
Victor Stinner37296e82010-06-10 13:36:23 +00003348 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003349 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003350 (strcmp(lower, "iso-8859-1") == 0) ||
3351 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003352 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003353#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003354 else if (strcmp(lower, "mbcs") == 0)
3355 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003356#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003357 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003358 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003359 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360
3361 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003362 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003363 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003364 return NULL;
3365
3366 /* The normal path */
3367 if (PyBytes_Check(v))
3368 return v;
3369
3370 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003371 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003372 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003373 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003374
3375 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003376 "encoder %s returned bytearray instead of bytes; "
3377 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003378 encoding);
3379 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003380 Py_DECREF(v);
3381 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003382 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003383
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003384 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3385 Py_DECREF(v);
3386 return b;
3387 }
3388
3389 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003390 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3391 "use codecs.encode() to encode to arbitrary types",
3392 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003393 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003394 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003395 return NULL;
3396}
3397
Alexander Belopolsky40018472011-02-26 01:02:56 +00003398PyObject *
3399PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003400 const char *encoding,
3401 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003402{
3403 PyObject *v;
3404
3405 if (!PyUnicode_Check(unicode)) {
3406 PyErr_BadArgument();
3407 goto onError;
3408 }
3409
3410 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003411 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003412
3413 /* Encode via the codec registry */
3414 v = PyCodec_Encode(unicode, encoding, errors);
3415 if (v == NULL)
3416 goto onError;
3417 if (!PyUnicode_Check(v)) {
3418 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003419 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3420 "use codecs.encode() to encode to arbitrary types",
3421 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003422 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003423 Py_DECREF(v);
3424 goto onError;
3425 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003426 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003427
Benjamin Peterson29060642009-01-31 22:14:21 +00003428 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003429 return NULL;
3430}
3431
/* Locate the first byte sequence in the `len` bytes at `str` that cannot
   be decoded, by stepping through the input with mbrtowc().

   Return the byte offset of the invalid or incomplete sequence, or 0 if
   none is found. Without mbrtowc() (HAVE_MBRTOWC undefined) the position
   cannot be determined and 0 is always returned. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cur = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cur, remaining, &state);

        if (consumed == 0)
            /* Reached end of string */
            break;
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cur - str;
        }
        cur += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3462
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003463PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003464PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003465 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003466{
3467 wchar_t smallbuf[256];
3468 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3469 wchar_t *wstr;
3470 size_t wlen, wlen2;
3471 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003472 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003473 size_t error_pos;
3474 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003475 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3476 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003477
3478 if (locale_error_handler(errors, &surrogateescape) < 0)
3479 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003480
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003481 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3482 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003483 return NULL;
3484 }
3485
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003486 if (surrogateescape) {
3487 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003488 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003489 if (wstr == NULL) {
3490 if (wlen == (size_t)-1)
3491 PyErr_NoMemory();
3492 else
3493 PyErr_SetFromErrno(PyExc_OSError);
3494 return NULL;
3495 }
3496
3497 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003498 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003499 }
3500 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003501 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003502#ifndef HAVE_BROKEN_MBSTOWCS
3503 wlen = mbstowcs(NULL, str, 0);
3504#else
3505 wlen = len;
3506#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003507 if (wlen == (size_t)-1)
3508 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003509 if (wlen+1 <= smallbuf_len) {
3510 wstr = smallbuf;
3511 }
3512 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003513 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003514 if (!wstr)
3515 return PyErr_NoMemory();
3516 }
3517
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003518 wlen2 = mbstowcs(wstr, str, wlen+1);
3519 if (wlen2 == (size_t)-1) {
3520 if (wstr != smallbuf)
3521 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003522 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003523 }
3524#ifdef HAVE_BROKEN_MBSTOWCS
3525 assert(wlen2 == wlen);
3526#endif
3527 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3528 if (wstr != smallbuf)
3529 PyMem_Free(wstr);
3530 }
3531 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003532
3533decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003534 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003535 errmsg = strerror(errno);
3536 assert(errmsg != NULL);
3537
3538 error_pos = mbstowcs_errorpos(str, len);
3539 if (errmsg != NULL) {
3540 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003541 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003542 if (wstr != NULL) {
3543 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003544 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003545 }
Victor Stinner2f197072011-12-17 07:08:30 +01003546 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003547 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003548 reason = PyUnicode_FromString(
3549 "mbstowcs() encountered an invalid multibyte sequence");
3550 if (reason == NULL)
3551 return NULL;
3552
3553 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3554 "locale", str, len,
3555 (Py_ssize_t)error_pos,
3556 (Py_ssize_t)(error_pos+1),
3557 reason);
3558 Py_DECREF(reason);
3559 if (exc != NULL) {
3560 PyCodec_StrictErrors(exc);
3561 Py_XDECREF(exc);
3562 }
3563 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003564}
3565
3566PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003567PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003568{
3569 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003570 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003571}
3572
3573
3574PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003575PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003576 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003577 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3578}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003579
Christian Heimes5894ba72007-11-04 11:43:14 +00003580PyObject*
3581PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3582{
Victor Stinner99b95382011-07-04 14:23:54 +02003583#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003584 return PyUnicode_DecodeMBCS(s, size, NULL);
3585#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003586 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003587#else
Victor Stinner793b5312011-04-27 00:24:21 +02003588 PyInterpreterState *interp = PyThreadState_GET()->interp;
3589 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3590 cannot use it to encode and decode filenames before it is loaded. Load
3591 the Python codec requires to encode at least its own filename. Use the C
3592 version of the locale codec until the codec registry is initialized and
3593 the Python codec is loaded.
3594
3595 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3596 cannot only rely on it: check also interp->fscodec_initialized for
3597 subinterpreters. */
3598 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003599 return PyUnicode_Decode(s, size,
3600 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003601 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003602 }
3603 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003604 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003605 }
Victor Stinnerad158722010-10-27 00:25:46 +00003606#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003607}
3608
Martin v. Löwis011e8422009-05-05 04:43:17 +00003609
3610int
3611PyUnicode_FSConverter(PyObject* arg, void* addr)
3612{
3613 PyObject *output = NULL;
3614 Py_ssize_t size;
3615 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003616 if (arg == NULL) {
3617 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003618 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003619 return 1;
3620 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003621 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003622 output = arg;
3623 Py_INCREF(output);
3624 }
3625 else {
3626 arg = PyUnicode_FromObject(arg);
3627 if (!arg)
3628 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003629 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003630 Py_DECREF(arg);
3631 if (!output)
3632 return 0;
3633 if (!PyBytes_Check(output)) {
3634 Py_DECREF(output);
3635 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3636 return 0;
3637 }
3638 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003639 size = PyBytes_GET_SIZE(output);
3640 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003641 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003642 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003643 Py_DECREF(output);
3644 return 0;
3645 }
3646 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003647 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003648}
3649
3650
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003651int
3652PyUnicode_FSDecoder(PyObject* arg, void* addr)
3653{
3654 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003655 if (arg == NULL) {
3656 Py_DECREF(*(PyObject**)addr);
3657 return 1;
3658 }
3659 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003660 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003661 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003662 output = arg;
3663 Py_INCREF(output);
3664 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003665 else if (PyObject_CheckBuffer(arg)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003666 arg = PyBytes_FromObject(arg);
3667 if (!arg)
3668 return 0;
3669 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3670 PyBytes_GET_SIZE(arg));
3671 Py_DECREF(arg);
3672 if (!output)
3673 return 0;
3674 if (!PyUnicode_Check(output)) {
3675 Py_DECREF(output);
3676 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3677 return 0;
3678 }
3679 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003680 else {
3681 PyErr_Format(PyExc_TypeError,
3682 "path should be string or bytes, not %.200s",
3683 Py_TYPE(arg)->tp_name);
3684 return 0;
3685 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003686 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003687 Py_DECREF(output);
3688 return 0;
3689 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003690 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003691 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003692 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003693 Py_DECREF(output);
3694 return 0;
3695 }
3696 *(PyObject**)addr = output;
3697 return Py_CLEANUP_SUPPORTED;
3698}
3699
3700
Martin v. Löwis5b222132007-06-10 09:51:05 +00003701char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003702PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003703{
Christian Heimesf3863112007-11-22 07:46:41 +00003704 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003705
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003706 if (!PyUnicode_Check(unicode)) {
3707 PyErr_BadArgument();
3708 return NULL;
3709 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003710 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003711 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003712
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003713 if (PyUnicode_UTF8(unicode) == NULL) {
3714 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003715 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3716 if (bytes == NULL)
3717 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003718 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3719 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003720 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003721 Py_DECREF(bytes);
3722 return NULL;
3723 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003724 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3725 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3726 PyBytes_AS_STRING(bytes),
3727 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003728 Py_DECREF(bytes);
3729 }
3730
3731 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003732 *psize = PyUnicode_UTF8_LENGTH(unicode);
3733 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003734}
3735
3736char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003737PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003738{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003739 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3740}
3741
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003742Py_UNICODE *
3743PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3744{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003745 const unsigned char *one_byte;
3746#if SIZEOF_WCHAR_T == 4
3747 const Py_UCS2 *two_bytes;
3748#else
3749 const Py_UCS4 *four_bytes;
3750 const Py_UCS4 *ucs4_end;
3751 Py_ssize_t num_surrogates;
3752#endif
3753 wchar_t *w;
3754 wchar_t *wchar_end;
3755
3756 if (!PyUnicode_Check(unicode)) {
3757 PyErr_BadArgument();
3758 return NULL;
3759 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003760 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003761 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003762 assert(_PyUnicode_KIND(unicode) != 0);
3763 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003764
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003765 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003766#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003767 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3768 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003769 num_surrogates = 0;
3770
3771 for (; four_bytes < ucs4_end; ++four_bytes) {
3772 if (*four_bytes > 0xFFFF)
3773 ++num_surrogates;
3774 }
3775
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003776 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3777 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3778 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003779 PyErr_NoMemory();
3780 return NULL;
3781 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003782 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003783
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003784 w = _PyUnicode_WSTR(unicode);
3785 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3786 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003787 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3788 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003789 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003790 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003791 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3792 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003793 }
3794 else
3795 *w = *four_bytes;
3796
3797 if (w > wchar_end) {
3798 assert(0 && "Miscalculated string end");
3799 }
3800 }
3801 *w = 0;
3802#else
3803 /* sizeof(wchar_t) == 4 */
3804 Py_FatalError("Impossible unicode object state, wstr and str "
3805 "should share memory already.");
3806 return NULL;
3807#endif
3808 }
3809 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02003810 if ((size_t)_PyUnicode_LENGTH(unicode) >
3811 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3812 PyErr_NoMemory();
3813 return NULL;
3814 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003815 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3816 (_PyUnicode_LENGTH(unicode) + 1));
3817 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003818 PyErr_NoMemory();
3819 return NULL;
3820 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003821 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3822 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3823 w = _PyUnicode_WSTR(unicode);
3824 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003825
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003826 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3827 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003828 for (; w < wchar_end; ++one_byte, ++w)
3829 *w = *one_byte;
3830 /* null-terminate the wstr */
3831 *w = 0;
3832 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003833 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003834#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003835 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003836 for (; w < wchar_end; ++two_bytes, ++w)
3837 *w = *two_bytes;
3838 /* null-terminate the wstr */
3839 *w = 0;
3840#else
3841 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003842 PyObject_FREE(_PyUnicode_WSTR(unicode));
3843 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003844 Py_FatalError("Impossible unicode object state, wstr "
3845 "and str should share memory already.");
3846 return NULL;
3847#endif
3848 }
3849 else {
3850 assert(0 && "This should never happen.");
3851 }
3852 }
3853 }
3854 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003855 *size = PyUnicode_WSTR_LENGTH(unicode);
3856 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003857}
3858
Alexander Belopolsky40018472011-02-26 01:02:56 +00003859Py_UNICODE *
3860PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003861{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003862 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003863}
3864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003865
Alexander Belopolsky40018472011-02-26 01:02:56 +00003866Py_ssize_t
3867PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003868{
3869 if (!PyUnicode_Check(unicode)) {
3870 PyErr_BadArgument();
3871 goto onError;
3872 }
3873 return PyUnicode_GET_SIZE(unicode);
3874
Benjamin Peterson29060642009-01-31 22:14:21 +00003875 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003876 return -1;
3877}
3878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003879Py_ssize_t
3880PyUnicode_GetLength(PyObject *unicode)
3881{
Victor Stinner07621332012-06-16 04:53:46 +02003882 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003883 PyErr_BadArgument();
3884 return -1;
3885 }
Victor Stinner07621332012-06-16 04:53:46 +02003886 if (PyUnicode_READY(unicode) == -1)
3887 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003888 return PyUnicode_GET_LENGTH(unicode);
3889}
3890
3891Py_UCS4
3892PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3893{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003894 void *data;
3895 int kind;
3896
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003897 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3898 PyErr_BadArgument();
3899 return (Py_UCS4)-1;
3900 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003901 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003902 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003903 return (Py_UCS4)-1;
3904 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003905 data = PyUnicode_DATA(unicode);
3906 kind = PyUnicode_KIND(unicode);
3907 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003908}
3909
3910int
3911PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3912{
3913 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003914 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003915 return -1;
3916 }
Victor Stinner488fa492011-12-12 00:01:39 +01003917 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003918 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003919 PyErr_SetString(PyExc_IndexError, "string index out of range");
3920 return -1;
3921 }
Victor Stinner488fa492011-12-12 00:01:39 +01003922 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003923 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003924 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3925 PyErr_SetString(PyExc_ValueError, "character out of range");
3926 return -1;
3927 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003928 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3929 index, ch);
3930 return 0;
3931}
3932
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
3938
Victor Stinner554f3f02010-06-16 23:33:54 +00003939/* create or adjust a UnicodeDecodeError */
3940static void
3941make_decode_exception(PyObject **exceptionObject,
3942 const char *encoding,
3943 const char *input, Py_ssize_t length,
3944 Py_ssize_t startpos, Py_ssize_t endpos,
3945 const char *reason)
3946{
3947 if (*exceptionObject == NULL) {
3948 *exceptionObject = PyUnicodeDecodeError_Create(
3949 encoding, input, length, startpos, endpos, reason);
3950 }
3951 else {
3952 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3953 goto onError;
3954 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3955 goto onError;
3956 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3957 goto onError;
3958 }
3959 return;
3960
3961onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02003962 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00003963}
3964
#ifdef HAVE_MBCS
/* error handling callback helper (wchar_t output variant):
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   *output must be a string of kind PyUnicode_WCHAR_KIND; it is resized when
   the replacement plus the remaining input does not fit.  On success
   *input/*inend are refreshed from the exception's object attribute,
   *endinpos and *inptr are moved to the position returned by the handler
   and *outpos is advanced past the replacement.

   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The text after the ';' doubles as the TypeError message below. */
    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    /* Look the handler up once and cache it for later calls. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        /* Bail out: the PyBytes_* accessors below must not be used on a
           non-bytes object. */
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow at least geometrically when that cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4080
4081static int
4082unicode_decode_call_errorhandler_writer(
4083 const char *errors, PyObject **errorHandler,
4084 const char *encoding, const char *reason,
4085 const char **input, const char **inend, Py_ssize_t *startinpos,
4086 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4087 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4088{
4089 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4090
4091 PyObject *restuple = NULL;
4092 PyObject *repunicode = NULL;
4093 Py_ssize_t insize;
4094 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004095 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004096 PyObject *inputobj = NULL;
4097
4098 if (*errorHandler == NULL) {
4099 *errorHandler = PyCodec_LookupError(errors);
4100 if (*errorHandler == NULL)
4101 goto onError;
4102 }
4103
4104 make_decode_exception(exceptionObject,
4105 encoding,
4106 *input, *inend - *input,
4107 *startinpos, *endinpos,
4108 reason);
4109 if (*exceptionObject == NULL)
4110 goto onError;
4111
4112 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4113 if (restuple == NULL)
4114 goto onError;
4115 if (!PyTuple_Check(restuple)) {
4116 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4117 goto onError;
4118 }
4119 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004120 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004121
4122 /* Copy back the bytes variables, which might have been modified by the
4123 callback */
4124 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4125 if (!inputobj)
4126 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004127 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004128 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004129 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004130 *input = PyBytes_AS_STRING(inputobj);
4131 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004132 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004133 /* we can DECREF safely, as the exception has another reference,
4134 so the object won't go away. */
4135 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004136
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004137 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004138 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004139 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004140 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004141 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004142 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143
Victor Stinner8f674cc2013-04-17 23:02:17 +02004144 if (PyUnicode_READY(repunicode) < 0)
4145 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004146 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004147 if (replen > 1) {
4148 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004149 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004150 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4151 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4152 goto onError;
4153 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004154 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004155 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004156
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004157 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004158 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004159
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004160 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004161 Py_XDECREF(restuple);
4162 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004163
Benjamin Peterson29060642009-01-31 22:14:21 +00004164 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004165 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004166 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004167}
4168
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004169/* --- UTF-7 Codec -------------------------------------------------------- */
4170
Antoine Pitrou244651a2009-05-04 18:56:13 +00004171/* See RFC2152 for details. We encode conservatively and decode liberally. */
4172
4173/* Three simple macros defining base-64. */
4174
/* True if c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+' and '/').  c is evaluated more than once. */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))
4182
/* Map a base-64 character c to its 6-bit value: A-Z -> 0..25,
   a-z -> 26..51, 0-9 -> 52..61, '+' -> 62, anything else ('/') -> 63.
   c is assumed to satisfy IS_BASE64() and is evaluated more than once. */

#define FROM_BASE64(c)                                      \
    ((c) == '+' ? 62 :                                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :          \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :               \
     63)
4190
/* Base-64 character for the low 6 bits of n (higher bits are masked off). */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
4195
/* DECODE_DIRECT: true if byte c found in a UTF-7 string decodes as itself.
 * Decoding is permissive: every value up to 127 qualifies except '+',
 * which starts a base-64 sequence.  c is evaluated more than once. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) <= 127)
4203
4204/* The UTF-7 encoder treats ASCII characters differently according to
4205 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4206 * the above). See RFC2152. This array identifies these different
4207 * sets:
4208 * 0 : "Set D"
4209 * alphanumeric and '(),-./:?
4210 * 1 : "Set O"
4211 * !"#$%&*;<=>@[]^_`{|}
4212 * 2 : "whitespace"
4213 * ht nl cr sp
4214 * 3 : special (must be base64 encoded)
4215 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4216 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004217
/* Character class for each ASCII code 0..127, consulted by ENCODE_DIRECT:
 *   0 = Set D, 1 = Set O, 2 = whitespace, 3 = special (must be base64).
 * The table is only ever read, so it is declared const. */
static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c         character code; NUL and anything >= 128 is never direct
 *             (the range test also guards the table lookup).
 *   directO   non-zero if Set O characters may be written as themselves.
 *   directWS  non-zero if whitespace may be written as itself.
 *
 * directO and directWS are parenthesised in the expansion so that a
 * compound argument such as `a || b` is tested as a whole instead of
 * being re-associated with the `&&` that follows it.  c is evaluated
 * several times and must be free of side effects. */
#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004249
Alexander Belopolsky40018472011-02-26 01:02:56 +00004250PyObject *
4251PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004252 Py_ssize_t size,
4253 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004254{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004255 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4256}
4257
Antoine Pitrou244651a2009-05-04 18:56:13 +00004258/* The decoder. The only state we preserve is our read position,
4259 * i.e. how many characters we have consumed. So if we end in the
4260 * middle of a shift sequence we have to back off the read position
4261 * and the output to the beginning of the sequence, otherwise we lose
4262 * all the shift state (seen bits, number of bits seen, high
4263 * surrogate). */
4264
Alexander Belopolsky40018472011-02-26 01:02:56 +00004265PyObject *
4266PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004267 Py_ssize_t size,
4268 const char *errors,
4269 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004270{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004271 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004272 Py_ssize_t startinpos;
4273 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004274 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004275 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004276 const char *errmsg = "";
4277 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004278 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004279 unsigned int base64bits = 0;
4280 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004281 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 PyObject *errorHandler = NULL;
4283 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004284
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004285 if (size == 0) {
4286 if (consumed)
4287 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004288 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004289 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004290
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004291 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004292 _PyUnicodeWriter_Init(&writer);
4293 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004294
4295 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004296 e = s + size;
4297
4298 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004299 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004300 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004301 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004302
Antoine Pitrou244651a2009-05-04 18:56:13 +00004303 if (inShift) { /* in a base-64 section */
4304 if (IS_BASE64(ch)) { /* consume a base-64 character */
4305 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4306 base64bits += 6;
4307 s++;
4308 if (base64bits >= 16) {
4309 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004310 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004311 base64bits -= 16;
4312 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004313 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004314 if (surrogate) {
4315 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004316 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4317 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004318 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004319 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004320 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004321 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004322 }
4323 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004324 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004325 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004326 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004327 }
4328 }
Victor Stinner551ac952011-11-29 22:58:13 +01004329 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004330 /* first surrogate */
4331 surrogate = outCh;
4332 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004333 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004334 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004335 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004336 }
4337 }
4338 }
4339 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004340 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004341 if (base64bits > 0) { /* left-over bits */
4342 if (base64bits >= 6) {
4343 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004344 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004345 errmsg = "partial character in shift sequence";
4346 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004347 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004348 else {
4349 /* Some bits remain; they should be zero */
4350 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004351 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004352 errmsg = "non-zero padding bits in shift sequence";
4353 goto utf7Error;
4354 }
4355 }
4356 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004357 if (surrogate && DECODE_DIRECT(ch)) {
4358 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4359 goto onError;
4360 }
4361 surrogate = 0;
4362 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004363 /* '-' is absorbed; other terminating
4364 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004365 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004366 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004367 }
4368 }
4369 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004370 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004371 s++; /* consume '+' */
4372 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004373 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004374 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004375 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004376 }
4377 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004378 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004379 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004380 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004381 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004382 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004383 }
4384 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004385 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004386 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004387 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004388 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004389 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004390 else {
4391 startinpos = s-starts;
4392 s++;
4393 errmsg = "unexpected special character";
4394 goto utf7Error;
4395 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004396 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004397utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004398 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004399 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004400 errors, &errorHandler,
4401 "utf7", errmsg,
4402 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004403 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004404 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004405 }
4406
Antoine Pitrou244651a2009-05-04 18:56:13 +00004407 /* end of string */
4408
4409 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4410 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004411 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004412 if (surrogate ||
4413 (base64bits >= 6) ||
4414 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004415 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004416 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004417 errors, &errorHandler,
4418 "utf7", "unterminated shift sequence",
4419 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004420 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004421 goto onError;
4422 if (s < e)
4423 goto restart;
4424 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004425 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004426
4427 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004428 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004429 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004430 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004431 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004432 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004433 writer.kind, writer.data, shiftOutStart);
4434 Py_XDECREF(errorHandler);
4435 Py_XDECREF(exc);
4436 _PyUnicodeWriter_Dealloc(&writer);
4437 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004438 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004439 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004440 }
4441 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004442 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004443 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004444 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004445
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 Py_XDECREF(errorHandler);
4447 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004448 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004449
Benjamin Peterson29060642009-01-31 22:14:21 +00004450 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004451 Py_XDECREF(errorHandler);
4452 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004453 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004454 return NULL;
4455}
4456
4457
Alexander Belopolsky40018472011-02-26 01:02:56 +00004458PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004459_PyUnicode_EncodeUTF7(PyObject *str,
4460 int base64SetO,
4461 int base64WhiteSpace,
4462 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004463{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004464 int kind;
4465 void *data;
4466 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004467 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004468 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004469 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004470 unsigned int base64bits = 0;
4471 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004472 char * out;
4473 char * start;
4474
Benjamin Petersonbac79492012-01-14 13:34:47 -05004475 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004476 return NULL;
4477 kind = PyUnicode_KIND(str);
4478 data = PyUnicode_DATA(str);
4479 len = PyUnicode_GET_LENGTH(str);
4480
4481 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004482 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004483
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004484 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004485 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004486 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004487 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004488 if (v == NULL)
4489 return NULL;
4490
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004491 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004492 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004493 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004494
Antoine Pitrou244651a2009-05-04 18:56:13 +00004495 if (inShift) {
4496 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4497 /* shifting out */
4498 if (base64bits) { /* output remaining bits */
4499 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4500 base64buffer = 0;
4501 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004502 }
4503 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004504 /* Characters not in the BASE64 set implicitly unshift the sequence
4505 so no '-' is required, except if the character is itself a '-' */
4506 if (IS_BASE64(ch) || ch == '-') {
4507 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004508 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004509 *out++ = (char) ch;
4510 }
4511 else {
4512 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004513 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004514 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004515 else { /* not in a shift sequence */
4516 if (ch == '+') {
4517 *out++ = '+';
4518 *out++ = '-';
4519 }
4520 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4521 *out++ = (char) ch;
4522 }
4523 else {
4524 *out++ = '+';
4525 inShift = 1;
4526 goto encode_char;
4527 }
4528 }
4529 continue;
4530encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004531 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004532 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004533
Antoine Pitrou244651a2009-05-04 18:56:13 +00004534 /* code first surrogate */
4535 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004536 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004537 while (base64bits >= 6) {
4538 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4539 base64bits -= 6;
4540 }
4541 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004542 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004543 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004544 base64bits += 16;
4545 base64buffer = (base64buffer << 16) | ch;
4546 while (base64bits >= 6) {
4547 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4548 base64bits -= 6;
4549 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004550 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004551 if (base64bits)
4552 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4553 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004554 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004555 if (_PyBytes_Resize(&v, out - start) < 0)
4556 return NULL;
4557 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004558}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004559PyObject *
4560PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4561 Py_ssize_t size,
4562 int base64SetO,
4563 int base64WhiteSpace,
4564 const char *errors)
4565{
4566 PyObject *result;
4567 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4568 if (tmp == NULL)
4569 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004570 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004571 base64WhiteSpace, errors);
4572 Py_DECREF(tmp);
4573 return result;
4574}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004575
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576#undef IS_BASE64
4577#undef FROM_BASE64
4578#undef TO_BASE64
4579#undef DECODE_DIRECT
4580#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004581
Guido van Rossumd57fd912000-03-10 22:53:23 +00004582/* --- UTF-8 Codec -------------------------------------------------------- */
4583
Alexander Belopolsky40018472011-02-26 01:02:56 +00004584PyObject *
4585PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004586 Py_ssize_t size,
4587 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588{
Walter Dörwald69652032004-09-07 20:24:22 +00004589 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4590}
4591
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004592#include "stringlib/asciilib.h"
4593#include "stringlib/codecs.h"
4594#include "stringlib/undef.h"
4595
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004596#include "stringlib/ucs1lib.h"
4597#include "stringlib/codecs.h"
4598#include "stringlib/undef.h"
4599
4600#include "stringlib/ucs2lib.h"
4601#include "stringlib/codecs.h"
4602#include "stringlib/undef.h"
4603
4604#include "stringlib/ucs4lib.h"
4605#include "stringlib/codecs.h"
4606#include "stringlib/undef.h"
4607
Antoine Pitrouab868312009-01-10 15:40:25 +00004608/* Mask to quickly check whether a C 'long' contains a
4609 non-ASCII, UTF8-encoded char. */
4610#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004611# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004612#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004613# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004614#else
4615# error C 'long' size should be either 4 or 8!
4616#endif
4617
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004618static Py_ssize_t
4619ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004620{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004621 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004622 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004623
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004624 /*
4625 * Issue #17237: m68k is a bit different from most architectures in
4626 * that objects do not use "natural alignment" - for example, int and
4627 * long are only aligned at 2-byte boundaries. Therefore the assert()
4628 * won't work; also, tests have shown that skipping the "optimised
4629 * version" will even speed up m68k.
4630 */
4631#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004632#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004633 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4634 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004635 /* Fast path, see in STRINGLIB(utf8_decode) for
4636 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004637 /* Help allocation */
4638 const char *_p = p;
4639 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004640 while (_p < aligned_end) {
4641 unsigned long value = *(const unsigned long *) _p;
4642 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004643 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004644 *((unsigned long *)q) = value;
4645 _p += SIZEOF_LONG;
4646 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004647 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004648 p = _p;
4649 while (p < end) {
4650 if ((unsigned char)*p & 0x80)
4651 break;
4652 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004653 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004654 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004655 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004656#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004657#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004658 while (p < end) {
4659 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4660 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004661 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004662 /* Help allocation */
4663 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004664 while (_p < aligned_end) {
4665 unsigned long value = *(unsigned long *) _p;
4666 if (value & ASCII_CHAR_MASK)
4667 break;
4668 _p += SIZEOF_LONG;
4669 }
4670 p = _p;
4671 if (_p == end)
4672 break;
4673 }
4674 if ((unsigned char)*p & 0x80)
4675 break;
4676 ++p;
4677 }
4678 memcpy(dest, start, p - start);
4679 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680}
Antoine Pitrouab868312009-01-10 15:40:25 +00004681
Victor Stinner785938e2011-12-11 20:09:03 +01004682PyObject *
4683PyUnicode_DecodeUTF8Stateful(const char *s,
4684 Py_ssize_t size,
4685 const char *errors,
4686 Py_ssize_t *consumed)
4687{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004688 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004689 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004690 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004691
4692 Py_ssize_t startinpos;
4693 Py_ssize_t endinpos;
4694 const char *errmsg = "";
4695 PyObject *errorHandler = NULL;
4696 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004697
4698 if (size == 0) {
4699 if (consumed)
4700 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004701 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004702 }
4703
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004704 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4705 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004706 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004707 *consumed = 1;
4708 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004709 }
4710
Victor Stinner8f674cc2013-04-17 23:02:17 +02004711 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004712 writer.min_length = size;
4713 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004714 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004715
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004716 writer.pos = ascii_decode(s, end, writer.data);
4717 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004718 while (s < end) {
4719 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004720 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004721 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004722 if (PyUnicode_IS_ASCII(writer.buffer))
4723 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004724 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004725 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004726 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004727 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004728 } else {
4729 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004730 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004731 }
4732
4733 switch (ch) {
4734 case 0:
4735 if (s == end || consumed)
4736 goto End;
4737 errmsg = "unexpected end of data";
4738 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004739 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004740 break;
4741 case 1:
4742 errmsg = "invalid start byte";
4743 startinpos = s - starts;
4744 endinpos = startinpos + 1;
4745 break;
4746 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004747 case 3:
4748 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004749 errmsg = "invalid continuation byte";
4750 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004751 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004752 break;
4753 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004754 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004755 goto onError;
4756 continue;
4757 }
4758
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004759 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004760 errors, &errorHandler,
4761 "utf-8", errmsg,
4762 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004763 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004764 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004765 }
4766
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004767End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004768 if (consumed)
4769 *consumed = s - starts;
4770
4771 Py_XDECREF(errorHandler);
4772 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004773 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004774
4775onError:
4776 Py_XDECREF(errorHandler);
4777 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004778 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004779 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004780}
4781
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004782#ifdef __APPLE__
4783
4784/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004785 used to decode the command line arguments on Mac OS X.
4786
4787 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004788 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004789
4790wchar_t*
4791_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4792{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004793 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004794 wchar_t *unicode;
4795 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004796
4797 /* Note: size will always be longer than the resulting Unicode
4798 character count */
Victor Stinnerf50e1872015-03-20 11:32:24 +01004799 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004800 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004801 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004802 if (!unicode)
4803 return NULL;
4804
4805 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004806 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004807 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004808 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004809 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004810#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004811 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004812#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004813 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004814#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004815 if (ch > 0xFF) {
4816#if SIZEOF_WCHAR_T == 4
4817 assert(0);
4818#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02004819 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004820 /* compute and append the two surrogates: */
4821 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4822 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4823#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004824 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004825 else {
4826 if (!ch && s == e)
4827 break;
4828 /* surrogateescape */
4829 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4830 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004831 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004832 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004833 return unicode;
4834}
4835
4836#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004838/* Primary internal function which creates utf8 encoded bytes objects.
4839
4840 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004841 and allocate exactly as much space needed at the end. Else allocate the
4842 maximum possible needed (4 result bytes per Unicode character), and return
4843 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004844*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004845PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004846_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847{
Victor Stinner6099a032011-12-18 14:22:26 +01004848 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004849 void *data;
4850 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004852 if (!PyUnicode_Check(unicode)) {
4853 PyErr_BadArgument();
4854 return NULL;
4855 }
4856
4857 if (PyUnicode_READY(unicode) == -1)
4858 return NULL;
4859
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004860 if (PyUnicode_UTF8(unicode))
4861 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4862 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004863
4864 kind = PyUnicode_KIND(unicode);
4865 data = PyUnicode_DATA(unicode);
4866 size = PyUnicode_GET_LENGTH(unicode);
4867
Benjamin Petersonead6b532011-12-20 17:23:42 -06004868 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004869 default:
4870 assert(0);
4871 case PyUnicode_1BYTE_KIND:
4872 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4873 assert(!PyUnicode_IS_ASCII(unicode));
4874 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4875 case PyUnicode_2BYTE_KIND:
4876 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4877 case PyUnicode_4BYTE_KIND:
4878 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004879 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880}
4881
Alexander Belopolsky40018472011-02-26 01:02:56 +00004882PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004883PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4884 Py_ssize_t size,
4885 const char *errors)
4886{
4887 PyObject *v, *unicode;
4888
4889 unicode = PyUnicode_FromUnicode(s, size);
4890 if (unicode == NULL)
4891 return NULL;
4892 v = _PyUnicode_AsUTF8String(unicode, errors);
4893 Py_DECREF(unicode);
4894 return v;
4895}
4896
4897PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004898PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004900 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901}
4902
Walter Dörwald41980ca2007-08-16 21:55:45 +00004903/* --- UTF-32 Codec ------------------------------------------------------- */
4904
4905PyObject *
4906PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 Py_ssize_t size,
4908 const char *errors,
4909 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004910{
4911 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4912}
4913
4914PyObject *
4915PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004916 Py_ssize_t size,
4917 const char *errors,
4918 int *byteorder,
4919 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004920{
4921 const char *starts = s;
4922 Py_ssize_t startinpos;
4923 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004924 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004925 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004926 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004927 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004928 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004929 PyObject *errorHandler = NULL;
4930 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004931
Walter Dörwald41980ca2007-08-16 21:55:45 +00004932 q = (unsigned char *)s;
4933 e = q + size;
4934
4935 if (byteorder)
4936 bo = *byteorder;
4937
4938 /* Check for BOM marks (U+FEFF) in the input and adjust current
4939 byte order setting accordingly. In native mode, the leading BOM
4940 mark is skipped, in all other modes, it is copied to the output
4941 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004942 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07004943 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01004944 if (bom == 0x0000FEFF) {
4945 bo = -1;
4946 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004947 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004948 else if (bom == 0xFFFE0000) {
4949 bo = 1;
4950 q += 4;
4951 }
4952 if (byteorder)
4953 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004954 }
4955
Victor Stinnere64322e2012-10-30 23:12:47 +01004956 if (q == e) {
4957 if (consumed)
4958 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004959 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004960 }
4961
Victor Stinnere64322e2012-10-30 23:12:47 +01004962#ifdef WORDS_BIGENDIAN
4963 le = bo < 0;
4964#else
4965 le = bo <= 0;
4966#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004967 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01004968
Victor Stinner8f674cc2013-04-17 23:02:17 +02004969 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004970 writer.min_length = (e - q + 3) / 4;
4971 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004972 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004973
Victor Stinnere64322e2012-10-30 23:12:47 +01004974 while (1) {
4975 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004976 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004977
Victor Stinnere64322e2012-10-30 23:12:47 +01004978 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004979 enum PyUnicode_Kind kind = writer.kind;
4980 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004981 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004982 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004983 if (le) {
4984 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07004985 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01004986 if (ch > maxch)
4987 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004988 if (kind != PyUnicode_1BYTE_KIND &&
4989 Py_UNICODE_IS_SURROGATE(ch))
4990 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004991 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004992 q += 4;
4993 } while (q <= last);
4994 }
4995 else {
4996 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07004997 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01004998 if (ch > maxch)
4999 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005000 if (kind != PyUnicode_1BYTE_KIND &&
5001 Py_UNICODE_IS_SURROGATE(ch))
5002 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005003 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005004 q += 4;
5005 } while (q <= last);
5006 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005007 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005008 }
5009
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005010 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005011 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005012 startinpos = ((const char *)q) - starts;
5013 endinpos = startinpos + 4;
5014 }
5015 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005016 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005017 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005018 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005019 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005020 startinpos = ((const char *)q) - starts;
5021 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005022 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005023 else {
5024 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005025 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005026 goto onError;
5027 q += 4;
5028 continue;
5029 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005030 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005031 startinpos = ((const char *)q) - starts;
5032 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005033 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005034
5035 /* The remaining input chars are ignored if the callback
5036 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005037 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005039 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005040 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005041 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005042 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005043 }
5044
Walter Dörwald41980ca2007-08-16 21:55:45 +00005045 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005046 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005047
Walter Dörwald41980ca2007-08-16 21:55:45 +00005048 Py_XDECREF(errorHandler);
5049 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005050 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005051
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005053 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005054 Py_XDECREF(errorHandler);
5055 Py_XDECREF(exc);
5056 return NULL;
5057}
5058
5059PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005060_PyUnicode_EncodeUTF32(PyObject *str,
5061 const char *errors,
5062 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005063{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005064 enum PyUnicode_Kind kind;
5065 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005066 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005067 PyObject *v;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005068 PY_UINT32_T *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005069#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005070 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005071#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005072 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005073#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005074 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005075 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005076 PyObject *errorHandler = NULL;
5077 PyObject *exc = NULL;
5078 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005079
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005080 if (!PyUnicode_Check(str)) {
5081 PyErr_BadArgument();
5082 return NULL;
5083 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005084 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005085 return NULL;
5086 kind = PyUnicode_KIND(str);
5087 data = PyUnicode_DATA(str);
5088 len = PyUnicode_GET_LENGTH(str);
5089
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005090 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005091 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005092 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005093 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005094 if (v == NULL)
5095 return NULL;
5096
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005097 /* output buffer is 4-bytes aligned */
5098 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5099 out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005100 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005101 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005102 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005103 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005104
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005105 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005106 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005107 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005108 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005109 else
5110 encoding = "utf-32";
5111
5112 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005113 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5114 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005115 }
5116
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005117 pos = 0;
5118 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005119 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005120
5121 if (kind == PyUnicode_2BYTE_KIND) {
5122 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5123 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005124 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005125 else {
5126 assert(kind == PyUnicode_4BYTE_KIND);
5127 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5128 &out, native_ordering);
5129 }
5130 if (pos == len)
5131 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005132
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005133 rep = unicode_encode_call_errorhandler(
5134 errors, &errorHandler,
5135 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005136 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005137 if (!rep)
5138 goto error;
5139
5140 if (PyBytes_Check(rep)) {
5141 repsize = PyBytes_GET_SIZE(rep);
5142 if (repsize & 3) {
5143 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005144 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005145 "surrogates not allowed");
5146 goto error;
5147 }
5148 moreunits = repsize / 4;
5149 }
5150 else {
5151 assert(PyUnicode_Check(rep));
5152 if (PyUnicode_READY(rep) < 0)
5153 goto error;
5154 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5155 if (!PyUnicode_IS_ASCII(rep)) {
5156 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005157 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005158 "surrogates not allowed");
5159 goto error;
5160 }
5161 }
5162
5163 /* four bytes are reserved for each surrogate */
5164 if (moreunits > 1) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005165 Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005166 Py_ssize_t morebytes = 4 * (moreunits - 1);
5167 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5168 /* integer overflow */
5169 PyErr_NoMemory();
5170 goto error;
5171 }
5172 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5173 goto error;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005174 out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005175 }
5176
5177 if (PyBytes_Check(rep)) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005178 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5179 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005180 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005181 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005182 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5183 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005184 }
5185
5186 Py_CLEAR(rep);
5187 }
5188
5189 /* Cut back to size actually needed. This is necessary for, for example,
5190 encoding of a string containing isolated surrogates and the 'ignore'
5191 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005192 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005193 if (nsize != PyBytes_GET_SIZE(v))
5194 _PyBytes_Resize(&v, nsize);
5195 Py_XDECREF(errorHandler);
5196 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005197 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005198 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005199 error:
5200 Py_XDECREF(rep);
5201 Py_XDECREF(errorHandler);
5202 Py_XDECREF(exc);
5203 Py_XDECREF(v);
5204 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005205}
5206
Alexander Belopolsky40018472011-02-26 01:02:56 +00005207PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005208PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5209 Py_ssize_t size,
5210 const char *errors,
5211 int byteorder)
5212{
5213 PyObject *result;
5214 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5215 if (tmp == NULL)
5216 return NULL;
5217 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5218 Py_DECREF(tmp);
5219 return result;
5220}
5221
5222PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005223PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005224{
Victor Stinnerb960b342011-11-20 19:12:52 +01005225 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005226}
5227
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228/* --- UTF-16 Codec ------------------------------------------------------- */
5229
Tim Peters772747b2001-08-09 22:21:55 +00005230PyObject *
5231PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 Py_ssize_t size,
5233 const char *errors,
5234 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235{
Walter Dörwald69652032004-09-07 20:24:22 +00005236 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5237}
5238
5239PyObject *
5240PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 Py_ssize_t size,
5242 const char *errors,
5243 int *byteorder,
5244 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005245{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005246 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005247 Py_ssize_t startinpos;
5248 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005249 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005250 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005251 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005252 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005253 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005254 PyObject *errorHandler = NULL;
5255 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005256 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257
Tim Peters772747b2001-08-09 22:21:55 +00005258 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005259 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260
5261 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005262 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005264 /* Check for BOM marks (U+FEFF) in the input and adjust current
5265 byte order setting accordingly. In native mode, the leading BOM
5266 mark is skipped, in all other modes, it is copied to the output
5267 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005268 if (bo == 0 && size >= 2) {
5269 const Py_UCS4 bom = (q[1] << 8) | q[0];
5270 if (bom == 0xFEFF) {
5271 q += 2;
5272 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005273 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005274 else if (bom == 0xFFFE) {
5275 q += 2;
5276 bo = 1;
5277 }
5278 if (byteorder)
5279 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005280 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281
Antoine Pitrou63065d72012-05-15 23:48:04 +02005282 if (q == e) {
5283 if (consumed)
5284 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005285 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005286 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005287
Christian Heimes743e0cd2012-10-17 23:52:17 +02005288#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005289 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005290 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005291#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005292 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005293 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005294#endif
Tim Peters772747b2001-08-09 22:21:55 +00005295
Antoine Pitrou63065d72012-05-15 23:48:04 +02005296 /* Note: size will always be longer than the resulting Unicode
5297 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005298 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005299 writer.min_length = (e - q + 1) / 2;
5300 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005301 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005302
Antoine Pitrou63065d72012-05-15 23:48:04 +02005303 while (1) {
5304 Py_UCS4 ch = 0;
5305 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005306 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005307 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005308 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005309 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005310 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005311 native_ordering);
5312 else
5313 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005314 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005315 native_ordering);
5316 } else if (kind == PyUnicode_2BYTE_KIND) {
5317 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005318 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005319 native_ordering);
5320 } else {
5321 assert(kind == PyUnicode_4BYTE_KIND);
5322 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005323 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005324 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005325 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005326 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005327
Antoine Pitrou63065d72012-05-15 23:48:04 +02005328 switch (ch)
5329 {
5330 case 0:
5331 /* remaining byte at the end? (size should be even) */
5332 if (q == e || consumed)
5333 goto End;
5334 errmsg = "truncated data";
5335 startinpos = ((const char *)q) - starts;
5336 endinpos = ((const char *)e) - starts;
5337 break;
5338 /* The remaining input chars are ignored if the callback
5339 chooses to skip the input */
5340 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005341 q -= 2;
5342 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005343 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005344 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005345 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005346 endinpos = ((const char *)e) - starts;
5347 break;
5348 case 2:
5349 errmsg = "illegal encoding";
5350 startinpos = ((const char *)q) - 2 - starts;
5351 endinpos = startinpos + 2;
5352 break;
5353 case 3:
5354 errmsg = "illegal UTF-16 surrogate";
5355 startinpos = ((const char *)q) - 4 - starts;
5356 endinpos = startinpos + 2;
5357 break;
5358 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005359 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005360 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 continue;
5362 }
5363
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005364 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005365 errors,
5366 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005367 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005368 &starts,
5369 (const char **)&e,
5370 &startinpos,
5371 &endinpos,
5372 &exc,
5373 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005374 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376 }
5377
Antoine Pitrou63065d72012-05-15 23:48:04 +02005378End:
Walter Dörwald69652032004-09-07 20:24:22 +00005379 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005380 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005381
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005382 Py_XDECREF(errorHandler);
5383 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005384 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385
Benjamin Peterson29060642009-01-31 22:14:21 +00005386 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005387 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005388 Py_XDECREF(errorHandler);
5389 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390 return NULL;
5391}
5392
Tim Peters772747b2001-08-09 22:21:55 +00005393PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005394_PyUnicode_EncodeUTF16(PyObject *str,
5395 const char *errors,
5396 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005398 enum PyUnicode_Kind kind;
5399 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005400 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005401 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005402 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005403 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005404#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005405 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005406#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005407 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005408#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005409 const char *encoding;
5410 Py_ssize_t nsize, pos;
5411 PyObject *errorHandler = NULL;
5412 PyObject *exc = NULL;
5413 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005414
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005415 if (!PyUnicode_Check(str)) {
5416 PyErr_BadArgument();
5417 return NULL;
5418 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005419 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005420 return NULL;
5421 kind = PyUnicode_KIND(str);
5422 data = PyUnicode_DATA(str);
5423 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005424
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005425 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005426 if (kind == PyUnicode_4BYTE_KIND) {
5427 const Py_UCS4 *in = (const Py_UCS4 *)data;
5428 const Py_UCS4 *end = in + len;
5429 while (in < end)
5430 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005431 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005432 }
5433 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005434 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005435 nsize = len + pairs + (byteorder == 0);
5436 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 if (v == NULL)
5438 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005440 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005441 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005442 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005444 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005445 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005446 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005447
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005448 if (kind == PyUnicode_1BYTE_KIND) {
5449 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5450 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005451 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005452
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005453 if (byteorder < 0)
5454 encoding = "utf-16-le";
5455 else if (byteorder > 0)
5456 encoding = "utf-16-be";
5457 else
5458 encoding = "utf-16";
5459
5460 pos = 0;
5461 while (pos < len) {
5462 Py_ssize_t repsize, moreunits;
5463
5464 if (kind == PyUnicode_2BYTE_KIND) {
5465 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5466 &out, native_ordering);
5467 }
5468 else {
5469 assert(kind == PyUnicode_4BYTE_KIND);
5470 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5471 &out, native_ordering);
5472 }
5473 if (pos == len)
5474 break;
5475
5476 rep = unicode_encode_call_errorhandler(
5477 errors, &errorHandler,
5478 encoding, "surrogates not allowed",
5479 str, &exc, pos, pos + 1, &pos);
5480 if (!rep)
5481 goto error;
5482
5483 if (PyBytes_Check(rep)) {
5484 repsize = PyBytes_GET_SIZE(rep);
5485 if (repsize & 1) {
5486 raise_encode_exception(&exc, encoding,
5487 str, pos - 1, pos,
5488 "surrogates not allowed");
5489 goto error;
5490 }
5491 moreunits = repsize / 2;
5492 }
5493 else {
5494 assert(PyUnicode_Check(rep));
5495 if (PyUnicode_READY(rep) < 0)
5496 goto error;
5497 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5498 if (!PyUnicode_IS_ASCII(rep)) {
5499 raise_encode_exception(&exc, encoding,
5500 str, pos - 1, pos,
5501 "surrogates not allowed");
5502 goto error;
5503 }
5504 }
5505
5506 /* two bytes are reserved for each surrogate */
5507 if (moreunits > 1) {
5508 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5509 Py_ssize_t morebytes = 2 * (moreunits - 1);
5510 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5511 /* integer overflow */
5512 PyErr_NoMemory();
5513 goto error;
5514 }
5515 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5516 goto error;
5517 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5518 }
5519
5520 if (PyBytes_Check(rep)) {
5521 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5522 out += moreunits;
5523 } else /* rep is unicode */ {
5524 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5525 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5526 &out, native_ordering);
5527 }
5528
5529 Py_CLEAR(rep);
5530 }
5531
5532 /* Cut back to size actually needed. This is necessary for, for example,
5533 encoding of a string containing isolated surrogates and the 'ignore' handler
5534 is used. */
5535 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5536 if (nsize != PyBytes_GET_SIZE(v))
5537 _PyBytes_Resize(&v, nsize);
5538 Py_XDECREF(errorHandler);
5539 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005540 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005541 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005542 error:
5543 Py_XDECREF(rep);
5544 Py_XDECREF(errorHandler);
5545 Py_XDECREF(exc);
5546 Py_XDECREF(v);
5547 return NULL;
5548#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549}
5550
Alexander Belopolsky40018472011-02-26 01:02:56 +00005551PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005552PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5553 Py_ssize_t size,
5554 const char *errors,
5555 int byteorder)
5556{
5557 PyObject *result;
5558 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5559 if (tmp == NULL)
5560 return NULL;
5561 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5562 Py_DECREF(tmp);
5563 return result;
5564}
5565
5566PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005567PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005569 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570}
5571
5572/* --- Unicode Escape Codec ----------------------------------------------- */
5573
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the input, once its backslash escapes are resolved, is still a pure
   ASCII string.

   s    -- input bytes; only s[0] .. s[size-1] are read
   size -- number of bytes in s

   Returns the number of characters the decoded string will have, or -1
   when that cannot be guaranteed to be ASCII:
     - size is negative,
     - the input contains a byte above 127,
     - the input ends with a lone backslash,
     - an octal, x, u, U or N escape is present (their results are not
       guaranteed to be ASCII).
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before forming `s + size`: pointer arithmetic with a
       negative offset would point before the buffer, which is undefined
       behavior even if the pointer is never dereferenced. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        if (*p != '\\') {
            /* Normal character */
            ++length;
            continue;
        }

        /* Backslash-escape, check next char */
        ++p;
        /* Escape sequence reaches till end of string or
           non-ASCII follow-up. */
        if (p >= end || *p > 127)
            return -1;

        switch (*p) {
        case '\n':
            /* backslash + \n result in zero characters */
            break;
        case '\\': case '\'': case '\"':
        case 'b': case 'f': case 't':
        case 'n': case 'r': case 'v': case 'a':
            /* two input bytes collapse into one character */
            ++length;
            break;
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        case 'x': case 'u': case 'U': case 'N':
            /* these do not guarantee ASCII characters */
            return -1;
        default:
            /* unknown escape: count the backslash + the other character */
            length += 2;
            break;
        }
    }
    return length;
}
5628
Fredrik Lundh06d12682001-01-24 07:59:11 +00005629static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005630
Alexander Belopolsky40018472011-02-26 01:02:56 +00005631PyObject *
5632PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005633 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005634 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005636 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005637 Py_ssize_t startinpos;
5638 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005639 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005641 char* message;
5642 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005643 PyObject *errorHandler = NULL;
5644 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005645 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005646
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005647 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005648 if (len == 0)
5649 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005650
5651 /* After length_of_escaped_ascii_string() there are two alternatives,
5652 either the string is pure ASCII with named escapes like \n, etc.
5653 and we determined it's exact size (common case)
5654 or it contains \x, \u, ... escape sequences. then we create a
5655 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005656 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005657 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005658 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005659 }
5660 else {
5661 /* Escaped strings will always be longer than the resulting
5662 Unicode string, so we start with size here and then reduce the
5663 length after conversion to the true value.
5664 (but if the error callback returns a long replacement string
5665 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005666 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005667 }
5668
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005670 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005672
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673 while (s < end) {
5674 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005675 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005676 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677
5678 /* Non-escape characters are interpreted as Unicode ordinals */
5679 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005680 x = (unsigned char)*s;
5681 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005682 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005683 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 continue;
5685 }
5686
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005687 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 /* \ - Escapes */
5689 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005690 c = *s++;
5691 if (s > end)
5692 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005693
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005694 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695
Benjamin Peterson29060642009-01-31 22:14:21 +00005696 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005697#define WRITECHAR(ch) \
5698 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005699 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005700 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005701 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005702
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005704 case '\\': WRITECHAR('\\'); break;
5705 case '\'': WRITECHAR('\''); break;
5706 case '\"': WRITECHAR('\"'); break;
5707 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005708 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005709 case 'f': WRITECHAR('\014'); break;
5710 case 't': WRITECHAR('\t'); break;
5711 case 'n': WRITECHAR('\n'); break;
5712 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005713 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005714 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005715 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005716 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717
Benjamin Peterson29060642009-01-31 22:14:21 +00005718 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 case '0': case '1': case '2': case '3':
5720 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005721 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005722 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005723 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005724 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005725 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005727 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 break;
5729
Benjamin Peterson29060642009-01-31 22:14:21 +00005730 /* hex escapes */
5731 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005733 digits = 2;
5734 message = "truncated \\xXX escape";
5735 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736
Benjamin Peterson29060642009-01-31 22:14:21 +00005737 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005739 digits = 4;
5740 message = "truncated \\uXXXX escape";
5741 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742
Benjamin Peterson29060642009-01-31 22:14:21 +00005743 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005744 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005745 digits = 8;
5746 message = "truncated \\UXXXXXXXX escape";
5747 hexescape:
5748 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005749 if (end - s < digits) {
5750 /* count only hex digits */
5751 for (; s < end; ++s) {
5752 c = (unsigned char)*s;
5753 if (!Py_ISXDIGIT(c))
5754 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005755 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005756 goto error;
5757 }
5758 for (; digits--; ++s) {
5759 c = (unsigned char)*s;
5760 if (!Py_ISXDIGIT(c))
5761 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005762 chr = (chr<<4) & ~0xF;
5763 if (c >= '0' && c <= '9')
5764 chr += c - '0';
5765 else if (c >= 'a' && c <= 'f')
5766 chr += 10 + c - 'a';
5767 else
5768 chr += 10 + c - 'A';
5769 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005770 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 /* _decoding_error will have already written into the
5772 target buffer. */
5773 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005774 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005775 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005776 message = "illegal Unicode character";
5777 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005778 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005779 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005780 break;
5781
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005783 case 'N':
5784 message = "malformed \\N character escape";
5785 if (ucnhash_CAPI == NULL) {
5786 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005787 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5788 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005789 if (ucnhash_CAPI == NULL)
5790 goto ucnhashError;
5791 }
5792 if (*s == '{') {
5793 const char *start = s+1;
5794 /* look for the closing brace */
5795 while (*s != '}' && s < end)
5796 s++;
5797 if (s > start && s < end && *s == '}') {
5798 /* found a name. look it up in the unicode database */
5799 message = "unknown Unicode character name";
5800 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005801 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005802 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005803 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005804 goto store;
5805 }
5806 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005807 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005808
5809 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005810 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005811 message = "\\ at end of string";
5812 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005813 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005814 }
5815 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005816 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005817 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005818 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005819 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005821 continue;
5822
5823 error:
5824 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005825 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005826 errors, &errorHandler,
5827 "unicodeescape", message,
5828 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005829 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005830 goto onError;
5831 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005833#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005834
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005835 Py_XDECREF(errorHandler);
5836 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005837 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005838
Benjamin Peterson29060642009-01-31 22:14:21 +00005839 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005840 PyErr_SetString(
5841 PyExc_UnicodeError,
5842 "\\N escapes not supported (can't load unicodedata module)"
5843 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005844 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005845 Py_XDECREF(errorHandler);
5846 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005847 return NULL;
5848
Benjamin Peterson29060642009-01-31 22:14:21 +00005849 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005850 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005851 Py_XDECREF(errorHandler);
5852 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853 return NULL;
5854}
5855
5856/* Return a Unicode-Escape string version of the Unicode object.
5857
5858 If quotes is true, the string is enclosed in u"" or u'' quotes as
5859 appropriate.
5860
5861*/
5862
Alexander Belopolsky40018472011-02-26 01:02:56 +00005863PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005864PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005866 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005867 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005869 int kind;
5870 void *data;
5871 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872
Ezio Melottie7f90372012-10-05 03:33:31 +03005873 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005874 escape.
5875
Ezio Melottie7f90372012-10-05 03:33:31 +03005876 For UCS1 strings it's '\xxx', 4 bytes per source character.
5877 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5878 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005879 */
5880
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005881 if (!PyUnicode_Check(unicode)) {
5882 PyErr_BadArgument();
5883 return NULL;
5884 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005885 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005886 return NULL;
5887 len = PyUnicode_GET_LENGTH(unicode);
5888 kind = PyUnicode_KIND(unicode);
5889 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005890 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005891 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5892 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5893 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5894 }
5895
5896 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005897 return PyBytes_FromStringAndSize(NULL, 0);
5898
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005899 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005900 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005901
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005902 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005903 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005904 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005905 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 if (repr == NULL)
5907 return NULL;
5908
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005909 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005911 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005912 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005913
Walter Dörwald79e913e2007-05-12 11:08:06 +00005914 /* Escape backslashes */
5915 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 *p++ = '\\';
5917 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005918 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005919 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005920
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005921 /* Map 21-bit characters to '\U00xxxxxx' */
5922 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005923 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005924 *p++ = '\\';
5925 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005926 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5927 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5928 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5929 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5930 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5931 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5932 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5933 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005934 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005935 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005936
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005938 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 *p++ = '\\';
5940 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005941 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5942 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5943 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5944 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005946
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005947 /* Map special whitespace to '\t', \n', '\r' */
5948 else if (ch == '\t') {
5949 *p++ = '\\';
5950 *p++ = 't';
5951 }
5952 else if (ch == '\n') {
5953 *p++ = '\\';
5954 *p++ = 'n';
5955 }
5956 else if (ch == '\r') {
5957 *p++ = '\\';
5958 *p++ = 'r';
5959 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005960
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005961 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005962 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005964 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005965 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5966 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005967 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005968
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 /* Copy everything else as-is */
5970 else
5971 *p++ = (char) ch;
5972 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005974 assert(p - PyBytes_AS_STRING(repr) > 0);
5975 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5976 return NULL;
5977 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978}
5979
Alexander Belopolsky40018472011-02-26 01:02:56 +00005980PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005981PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5982 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005984 PyObject *result;
5985 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5986 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005988 result = PyUnicode_AsUnicodeEscapeString(tmp);
5989 Py_DECREF(tmp);
5990 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991}
5992
5993/* --- Raw Unicode Escape Codec ------------------------------------------- */
5994
Alexander Belopolsky40018472011-02-26 01:02:56 +00005995PyObject *
5996PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005997 Py_ssize_t size,
5998 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006000 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006001 Py_ssize_t startinpos;
6002 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006003 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004 const char *end;
6005 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006006 PyObject *errorHandler = NULL;
6007 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006008
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006009 if (size == 0)
6010 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006011
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 /* Escaped strings will always be longer than the resulting
6013 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006014 length after conversion to the true value. (But decoding error
6015 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006016 _PyUnicodeWriter_Init(&writer);
6017 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006018
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019 end = s + size;
6020 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006021 unsigned char c;
6022 Py_UCS4 x;
6023 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006024 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025
Benjamin Peterson29060642009-01-31 22:14:21 +00006026 /* Non-escape characters are interpreted as Unicode ordinals */
6027 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006028 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006029 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006030 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006032 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006033 startinpos = s-starts;
6034
6035 /* \u-escapes are only interpreted iff the number of leading
6036 backslashes if odd */
6037 bs = s;
6038 for (;s < end;) {
6039 if (*s != '\\')
6040 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006041 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006042 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006043 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006044 }
6045 if (((s - bs) & 1) == 0 ||
6046 s >= end ||
6047 (*s != 'u' && *s != 'U')) {
6048 continue;
6049 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006050 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006051 count = *s=='u' ? 4 : 8;
6052 s++;
6053
6054 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006055 for (x = 0, i = 0; i < count; ++i, ++s) {
6056 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006057 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006059 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 errors, &errorHandler,
6061 "rawunicodeescape", "truncated \\uXXXX",
6062 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006063 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 goto onError;
6065 goto nextByte;
6066 }
6067 x = (x<<4) & ~0xF;
6068 if (c >= '0' && c <= '9')
6069 x += c - '0';
6070 else if (c >= 'a' && c <= 'f')
6071 x += 10 + c - 'a';
6072 else
6073 x += 10 + c - 'A';
6074 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006075 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006076 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006077 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006078 }
6079 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006080 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006081 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006082 errors, &errorHandler,
6083 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006085 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006087 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 nextByte:
6089 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006091 Py_XDECREF(errorHandler);
6092 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006093 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006094
Benjamin Peterson29060642009-01-31 22:14:21 +00006095 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006096 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006097 Py_XDECREF(errorHandler);
6098 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 return NULL;
6100}
6101
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006102
Alexander Belopolsky40018472011-02-26 01:02:56 +00006103PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006104PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006106 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 char *p;
6108 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006109 Py_ssize_t expandsize, pos;
6110 int kind;
6111 void *data;
6112 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006114 if (!PyUnicode_Check(unicode)) {
6115 PyErr_BadArgument();
6116 return NULL;
6117 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006118 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006119 return NULL;
6120 kind = PyUnicode_KIND(unicode);
6121 data = PyUnicode_DATA(unicode);
6122 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006123 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6124 bytes, and 1 byte characters 4. */
6125 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006126
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006127 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006129
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006130 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 if (repr == NULL)
6132 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006133 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006134 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006136 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006137 for (pos = 0; pos < len; pos++) {
6138 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 /* Map 32-bit characters to '\Uxxxxxxxx' */
6140 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006141 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006142 *p++ = '\\';
6143 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006144 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6145 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6146 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6147 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6148 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6149 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6150 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6151 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006152 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006153 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006154 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155 *p++ = '\\';
6156 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006157 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6158 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6159 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6160 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006162 /* Copy everything else as-is */
6163 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 *p++ = (char) ch;
6165 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006166
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006167 assert(p > q);
6168 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006169 return NULL;
6170 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171}
6172
Alexander Belopolsky40018472011-02-26 01:02:56 +00006173PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006174PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6175 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006177 PyObject *result;
6178 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6179 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006180 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006181 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6182 Py_DECREF(tmp);
6183 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184}
6185
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006186/* --- Unicode Internal Codec ------------------------------------------- */
6187
Alexander Belopolsky40018472011-02-26 01:02:56 +00006188PyObject *
6189_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006190 Py_ssize_t size,
6191 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006192{
6193 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006194 Py_ssize_t startinpos;
6195 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006196 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006197 const char *end;
6198 const char *reason;
6199 PyObject *errorHandler = NULL;
6200 PyObject *exc = NULL;
6201
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006202 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006203 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006204 1))
6205 return NULL;
6206
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006207 if (size == 0)
6208 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006209
Victor Stinner8f674cc2013-04-17 23:02:17 +02006210 _PyUnicodeWriter_Init(&writer);
6211 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6212 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006213 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006214 }
6215 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006216
Victor Stinner8f674cc2013-04-17 23:02:17 +02006217 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006218 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006219 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006220 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006221 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006222 endinpos = end-starts;
6223 reason = "truncated input";
6224 goto error;
6225 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006226 /* We copy the raw representation one byte at a time because the
6227 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006228 ((char *) &uch)[0] = s[0];
6229 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006230#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006231 ((char *) &uch)[2] = s[2];
6232 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006233#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006234 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006235#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006236 /* We have to sanity check the raw data, otherwise doom looms for
6237 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006238 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006239 endinpos = s - starts + Py_UNICODE_SIZE;
6240 reason = "illegal code point (> 0x10FFFF)";
6241 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006242 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006243#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006244 s += Py_UNICODE_SIZE;
6245#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006246 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006247 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006248 Py_UNICODE uch2;
6249 ((char *) &uch2)[0] = s[0];
6250 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006251 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006252 {
Victor Stinner551ac952011-11-29 22:58:13 +01006253 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006254 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006255 }
6256 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006257#endif
6258
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006259 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006260 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006261 continue;
6262
6263 error:
6264 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006265 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006266 errors, &errorHandler,
6267 "unicode_internal", reason,
6268 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006269 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006270 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006271 }
6272
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006273 Py_XDECREF(errorHandler);
6274 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006275 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006276
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006278 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006279 Py_XDECREF(errorHandler);
6280 Py_XDECREF(exc);
6281 return NULL;
6282}
6283
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284/* --- Latin-1 Codec ------------------------------------------------------ */
6285
Alexander Belopolsky40018472011-02-26 01:02:56 +00006286PyObject *
6287PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006288 Py_ssize_t size,
6289 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006292 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293}
6294
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006295/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006296static void
6297make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006298 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006299 PyObject *unicode,
6300 Py_ssize_t startpos, Py_ssize_t endpos,
6301 const char *reason)
6302{
6303 if (*exceptionObject == NULL) {
6304 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006305 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006306 encoding, unicode, startpos, endpos, reason);
6307 }
6308 else {
6309 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6310 goto onError;
6311 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6312 goto onError;
6313 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6314 goto onError;
6315 return;
6316 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006317 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006318 }
6319}
6320
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006321/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006322static void
6323raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006324 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006325 PyObject *unicode,
6326 Py_ssize_t startpos, Py_ssize_t endpos,
6327 const char *reason)
6328{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006329 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006330 encoding, unicode, startpos, endpos, reason);
6331 if (*exceptionObject != NULL)
6332 PyCodec_StrictErrors(*exceptionObject);
6333}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006334
6335/* error handling callback helper:
6336 build arguments, call the callback and check the arguments,
6337 put the result into newpos and return the replacement string, which
6338 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006339static PyObject *
6340unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006341 PyObject **errorHandler,
6342 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006343 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006344 Py_ssize_t startpos, Py_ssize_t endpos,
6345 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006346{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006347 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006348 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006349 PyObject *restuple;
6350 PyObject *resunicode;
6351
6352 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006354 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006356 }
6357
Benjamin Petersonbac79492012-01-14 13:34:47 -05006358 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006359 return NULL;
6360 len = PyUnicode_GET_LENGTH(unicode);
6361
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006362 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006363 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006364 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006366
6367 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006368 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006369 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006370 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006371 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006372 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006373 Py_DECREF(restuple);
6374 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006375 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006376 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006377 &resunicode, newpos)) {
6378 Py_DECREF(restuple);
6379 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006380 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006381 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6382 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6383 Py_DECREF(restuple);
6384 return NULL;
6385 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006386 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006387 *newpos = len + *newpos;
6388 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006389 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006390 Py_DECREF(restuple);
6391 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006392 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006393 Py_INCREF(resunicode);
6394 Py_DECREF(restuple);
6395 return resunicode;
6396}
6397
Alexander Belopolsky40018472011-02-26 01:02:56 +00006398static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006399unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006400 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006401 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006402{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006403 /* input state */
6404 Py_ssize_t pos=0, size;
6405 int kind;
6406 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006407 /* output object */
6408 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006409 /* pointer into the output */
6410 char *str;
6411 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006412 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006413 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6414 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006415 PyObject *errorHandler = NULL;
6416 PyObject *exc = NULL;
6417 /* the following variable is used for caching string comparisons
6418 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6419 int known_errorHandler = -1;
6420
Benjamin Petersonbac79492012-01-14 13:34:47 -05006421 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006422 return NULL;
6423 size = PyUnicode_GET_LENGTH(unicode);
6424 kind = PyUnicode_KIND(unicode);
6425 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006426 /* allocate enough for a simple encoding without
6427 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006428 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006429 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006430 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006431 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006432 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006433 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006434 ressize = size;
6435
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006436 while (pos < size) {
6437 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006438
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 /* can we encode this? */
6440 if (c<limit) {
6441 /* no overflow check, because we know that the space is enough */
6442 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006443 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006444 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006446 Py_ssize_t requiredsize;
6447 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006448 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006449 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006450 Py_ssize_t collstart = pos;
6451 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 /* find all unecodable characters */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006453 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006454 ++collend;
6455 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6456 if (known_errorHandler==-1) {
6457 if ((errors==NULL) || (!strcmp(errors, "strict")))
6458 known_errorHandler = 1;
6459 else if (!strcmp(errors, "replace"))
6460 known_errorHandler = 2;
6461 else if (!strcmp(errors, "ignore"))
6462 known_errorHandler = 3;
6463 else if (!strcmp(errors, "xmlcharrefreplace"))
6464 known_errorHandler = 4;
6465 else
6466 known_errorHandler = 0;
6467 }
6468 switch (known_errorHandler) {
6469 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006470 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006471 goto onError;
6472 case 2: /* replace */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006473 while (collstart++ < collend)
Benjamin Peterson29060642009-01-31 22:14:21 +00006474 *str++ = '?'; /* fall through */
6475 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006476 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006477 break;
6478 case 4: /* xmlcharrefreplace */
6479 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006480 requiredsize = respos;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006481 /* determine replacement size */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006482 for (i = collstart; i < collend; ++i) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006483 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006484 Py_ssize_t incr;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006485 if (ch < 10)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006486 incr = 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006487 else if (ch < 100)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006488 incr = 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006489 else if (ch < 1000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006490 incr = 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006491 else if (ch < 10000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006492 incr = 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006493 else if (ch < 100000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006494 incr = 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006495 else if (ch < 1000000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006496 incr = 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006497 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006498 assert(ch <= MAX_UNICODE);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006499 incr = 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006500 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006501 if (requiredsize > PY_SSIZE_T_MAX - incr)
6502 goto overflow;
6503 requiredsize += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006505 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6506 goto overflow;
6507 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006508 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006509 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006510 requiredsize = 2*ressize;
6511 if (_PyBytes_Resize(&res, requiredsize))
6512 goto onError;
6513 str = PyBytes_AS_STRING(res) + respos;
6514 ressize = requiredsize;
6515 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006516 /* generate replacement */
6517 for (i = collstart; i < collend; ++i) {
6518 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006519 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006520 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 break;
6522 default:
6523 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006524 encoding, reason, unicode, &exc,
6525 collstart, collend, &newpos);
6526 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006527 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006529 if (PyBytes_Check(repunicode)) {
6530 /* Directly copy bytes result to output. */
6531 repsize = PyBytes_Size(repunicode);
6532 if (repsize > 1) {
6533 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006534 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006535 if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
6536 Py_DECREF(repunicode);
6537 goto overflow;
6538 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00006539 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6540 Py_DECREF(repunicode);
6541 goto onError;
6542 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006543 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006544 ressize += repsize-1;
6545 }
6546 memcpy(str, PyBytes_AsString(repunicode), repsize);
6547 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006548 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006549 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006550 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006551 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006552 /* need more space? (at least enough for what we
6553 have+the replacement+the rest of the string, so
6554 we won't have to check space for encodable characters) */
6555 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006556 repsize = PyUnicode_GET_LENGTH(repunicode);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006557 requiredsize = respos;
6558 if (requiredsize > PY_SSIZE_T_MAX - repsize)
6559 goto overflow;
6560 requiredsize += repsize;
6561 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6562 goto overflow;
6563 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006565 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 requiredsize = 2*ressize;
6567 if (_PyBytes_Resize(&res, requiredsize)) {
6568 Py_DECREF(repunicode);
6569 goto onError;
6570 }
6571 str = PyBytes_AS_STRING(res) + respos;
6572 ressize = requiredsize;
6573 }
6574 /* check if there is anything unencodable in the replacement
6575 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006576 for (i = 0; repsize-->0; ++i, ++str) {
6577 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006579 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006580 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 Py_DECREF(repunicode);
6582 goto onError;
6583 }
6584 *str = (char)c;
6585 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006586 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006587 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006588 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006589 }
6590 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006591 /* Resize if we allocated to much */
6592 size = str - PyBytes_AS_STRING(res);
6593 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006594 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006595 if (_PyBytes_Resize(&res, size) < 0)
6596 goto onError;
6597 }
6598
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006599 Py_XDECREF(errorHandler);
6600 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006601 return res;
6602
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006603 overflow:
6604 PyErr_SetString(PyExc_OverflowError,
6605 "encoded result is too long for a Python string");
6606
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006607 onError:
6608 Py_XDECREF(res);
6609 Py_XDECREF(errorHandler);
6610 Py_XDECREF(exc);
6611 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006612}
6613
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006614/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006615PyObject *
6616PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006617 Py_ssize_t size,
6618 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006620 PyObject *result;
6621 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6622 if (unicode == NULL)
6623 return NULL;
6624 result = unicode_encode_ucs1(unicode, errors, 256);
6625 Py_DECREF(unicode);
6626 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627}
6628
Alexander Belopolsky40018472011-02-26 01:02:56 +00006629PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006630_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631{
6632 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006633 PyErr_BadArgument();
6634 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006636 if (PyUnicode_READY(unicode) == -1)
6637 return NULL;
6638 /* Fast path: if it is a one-byte string, construct
6639 bytes object directly. */
6640 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6641 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6642 PyUnicode_GET_LENGTH(unicode));
6643 /* Non-Latin-1 characters present. Defer to above function to
6644 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006645 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006646}
6647
6648PyObject*
6649PyUnicode_AsLatin1String(PyObject *unicode)
6650{
6651 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652}
6653
6654/* --- 7-bit ASCII Codec -------------------------------------------------- */
6655
Alexander Belopolsky40018472011-02-26 01:02:56 +00006656PyObject *
6657PyUnicode_DecodeASCII(const char *s,
6658 Py_ssize_t size,
6659 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006661 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006662 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006663 int kind;
6664 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006665 Py_ssize_t startinpos;
6666 Py_ssize_t endinpos;
6667 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006668 const char *e;
6669 PyObject *errorHandler = NULL;
6670 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006671
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006673 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006674
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006676 if (size == 1 && (unsigned char)s[0] < 128)
6677 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006678
Victor Stinner8f674cc2013-04-17 23:02:17 +02006679 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006680 writer.min_length = size;
6681 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006682 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006683
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006684 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006685 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006686 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006687 writer.pos = outpos;
6688 if (writer.pos == size)
6689 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006690
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006691 s += writer.pos;
6692 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006693 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006694 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006695 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006696 PyUnicode_WRITE(kind, data, writer.pos, c);
6697 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006698 ++s;
6699 }
6700 else {
6701 startinpos = s-starts;
6702 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006703 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006704 errors, &errorHandler,
6705 "ascii", "ordinal not in range(128)",
6706 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006707 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006708 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006709 kind = writer.kind;
6710 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006711 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006713 Py_XDECREF(errorHandler);
6714 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006715 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006716
Benjamin Peterson29060642009-01-31 22:14:21 +00006717 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006718 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006719 Py_XDECREF(errorHandler);
6720 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721 return NULL;
6722}
6723
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006724/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006725PyObject *
6726PyUnicode_EncodeASCII(const Py_UNICODE *p,
6727 Py_ssize_t size,
6728 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006730 PyObject *result;
6731 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6732 if (unicode == NULL)
6733 return NULL;
6734 result = unicode_encode_ucs1(unicode, errors, 128);
6735 Py_DECREF(unicode);
6736 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737}
6738
Alexander Belopolsky40018472011-02-26 01:02:56 +00006739PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006740_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741{
6742 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 PyErr_BadArgument();
6744 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006746 if (PyUnicode_READY(unicode) == -1)
6747 return NULL;
6748 /* Fast path: if it is an ASCII-only string, construct bytes object
6749 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006750 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006751 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6752 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006753 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006754}
6755
6756PyObject *
6757PyUnicode_AsASCIIString(PyObject *unicode)
6758{
6759 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760}
6761
Victor Stinner99b95382011-07-04 14:23:54 +02006762#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006763
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006764/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006765
/* NEED_RETRY is defined when `int` is narrower than `size_t`.
   NOTE(review): presumably this lets the code page codecs process inputs
   longer than an `int` length can describe in several passes -- confirm
   against the users of NEED_RETRY further down. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* Fallback definition for builds whose headers do not provide
   WC_ERR_INVALID_CHARS. */
#ifndef WC_ERR_INVALID_CHARS
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6773
6774static char*
6775code_page_name(UINT code_page, PyObject **obj)
6776{
6777 *obj = NULL;
6778 if (code_page == CP_ACP)
6779 return "mbcs";
6780 if (code_page == CP_UTF7)
6781 return "CP_UTF7";
6782 if (code_page == CP_UTF8)
6783 return "CP_UTF8";
6784
6785 *obj = PyBytes_FromFormat("cp%u", code_page);
6786 if (*obj == NULL)
6787 return NULL;
6788 return PyBytes_AS_STRING(*obj);
6789}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006790
Victor Stinner3a50e702011-10-18 21:21:00 +02006791static DWORD
6792decode_code_page_flags(UINT code_page)
6793{
6794 if (code_page == CP_UTF7) {
6795 /* The CP_UTF7 decoder only supports flags=0 */
6796 return 0;
6797 }
6798 else
6799 return MB_ERR_INVALID_CHARS;
6800}
6801
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006802/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006803 * Decode a byte string from a Windows code page into unicode object in strict
6804 * mode.
6805 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006806 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6807 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006808 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006809static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006810decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006811 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006812 const char *in,
6813 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006814{
Victor Stinner3a50e702011-10-18 21:21:00 +02006815 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006816 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006817 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006818
6819 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006820 assert(insize > 0);
6821 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6822 if (outsize <= 0)
6823 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006824
6825 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006826 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006827 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006828 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 if (*v == NULL)
6830 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006831 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006832 }
6833 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006834 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006835 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006836 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006837 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006838 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006839 }
6840
6841 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006842 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6843 if (outsize <= 0)
6844 goto error;
6845 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006846
Victor Stinner3a50e702011-10-18 21:21:00 +02006847error:
6848 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6849 return -2;
6850 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006851 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006852}
6853
Victor Stinner3a50e702011-10-18 21:21:00 +02006854/*
6855 * Decode a byte string from a code page into unicode object with an error
6856 * handler.
6857 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006858 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006859 * UnicodeDecodeError exception and returns -1 on error.
6860 */
6861static int
6862decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006863 PyObject **v,
6864 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01006865 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02006866{
6867 const char *startin = in;
6868 const char *endin = in + size;
6869 const DWORD flags = decode_code_page_flags(code_page);
6870 /* Ideally, we should get reason from FormatMessage. This is the Windows
6871 2000 English version of the message. */
6872 const char *reason = "No mapping for the Unicode character exists "
6873 "in the target code page.";
6874 /* each step cannot decode more than 1 character, but a character can be
6875 represented as a surrogate pair */
6876 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006877 int insize;
6878 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006879 PyObject *errorHandler = NULL;
6880 PyObject *exc = NULL;
6881 PyObject *encoding_obj = NULL;
6882 char *encoding;
6883 DWORD err;
6884 int ret = -1;
6885
6886 assert(size > 0);
6887
6888 encoding = code_page_name(code_page, &encoding_obj);
6889 if (encoding == NULL)
6890 return -1;
6891
Victor Stinner7d00cc12014-03-17 23:08:06 +01006892 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02006893 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6894 UnicodeDecodeError. */
6895 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6896 if (exc != NULL) {
6897 PyCodec_StrictErrors(exc);
6898 Py_CLEAR(exc);
6899 }
6900 goto error;
6901 }
6902
6903 if (*v == NULL) {
6904 /* Create unicode object */
6905 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6906 PyErr_NoMemory();
6907 goto error;
6908 }
Victor Stinnerab595942011-12-17 04:59:06 +01006909 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006910 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006911 if (*v == NULL)
6912 goto error;
6913 startout = PyUnicode_AS_UNICODE(*v);
6914 }
6915 else {
6916 /* Extend unicode object */
6917 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6918 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6919 PyErr_NoMemory();
6920 goto error;
6921 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006922 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006923 goto error;
6924 startout = PyUnicode_AS_UNICODE(*v) + n;
6925 }
6926
6927 /* Decode the byte string character per character */
6928 out = startout;
6929 while (in < endin)
6930 {
6931 /* Decode a character */
6932 insize = 1;
6933 do
6934 {
6935 outsize = MultiByteToWideChar(code_page, flags,
6936 in, insize,
6937 buffer, Py_ARRAY_LENGTH(buffer));
6938 if (outsize > 0)
6939 break;
6940 err = GetLastError();
6941 if (err != ERROR_NO_UNICODE_TRANSLATION
6942 && err != ERROR_INSUFFICIENT_BUFFER)
6943 {
6944 PyErr_SetFromWindowsErr(0);
6945 goto error;
6946 }
6947 insize++;
6948 }
6949 /* 4=maximum length of a UTF-8 sequence */
6950 while (insize <= 4 && (in + insize) <= endin);
6951
6952 if (outsize <= 0) {
6953 Py_ssize_t startinpos, endinpos, outpos;
6954
Victor Stinner7d00cc12014-03-17 23:08:06 +01006955 /* last character in partial decode? */
6956 if (in + insize >= endin && !final)
6957 break;
6958
Victor Stinner3a50e702011-10-18 21:21:00 +02006959 startinpos = in - startin;
6960 endinpos = startinpos + 1;
6961 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006962 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006963 errors, &errorHandler,
6964 encoding, reason,
6965 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006966 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006967 {
6968 goto error;
6969 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006970 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006971 }
6972 else {
6973 in += insize;
6974 memcpy(out, buffer, outsize * sizeof(wchar_t));
6975 out += outsize;
6976 }
6977 }
6978
6979 /* write a NUL character at the end */
6980 *out = 0;
6981
6982 /* Extend unicode object */
6983 outsize = out - startout;
6984 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006985 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006986 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02006987 /* (in - startin) <= size and size is an int */
6988 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02006989
6990error:
6991 Py_XDECREF(encoding_obj);
6992 Py_XDECREF(errorHandler);
6993 Py_XDECREF(exc);
6994 return ret;
6995}
6996
Victor Stinner3a50e702011-10-18 21:21:00 +02006997static PyObject *
6998decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006999 const char *s, Py_ssize_t size,
7000 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007001{
Victor Stinner76a31a62011-11-04 00:05:13 +01007002 PyObject *v = NULL;
7003 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007004
Victor Stinner3a50e702011-10-18 21:21:00 +02007005 if (code_page < 0) {
7006 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7007 return NULL;
7008 }
7009
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007010 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007011 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007012
Victor Stinner76a31a62011-11-04 00:05:13 +01007013 do
7014 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007015#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007016 if (size > INT_MAX) {
7017 chunk_size = INT_MAX;
7018 final = 0;
7019 done = 0;
7020 }
7021 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007022#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007023 {
7024 chunk_size = (int)size;
7025 final = (consumed == NULL);
7026 done = 1;
7027 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007028
Victor Stinner76a31a62011-11-04 00:05:13 +01007029 if (chunk_size == 0 && done) {
7030 if (v != NULL)
7031 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007032 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007033 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007034
Victor Stinner76a31a62011-11-04 00:05:13 +01007035 converted = decode_code_page_strict(code_page, &v,
7036 s, chunk_size);
7037 if (converted == -2)
7038 converted = decode_code_page_errors(code_page, &v,
7039 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007040 errors, final);
7041 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007042
7043 if (converted < 0) {
7044 Py_XDECREF(v);
7045 return NULL;
7046 }
7047
7048 if (consumed)
7049 *consumed += converted;
7050
7051 s += converted;
7052 size -= converted;
7053 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007054
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007055 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007056}
7057
Alexander Belopolsky40018472011-02-26 01:02:56 +00007058PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007059PyUnicode_DecodeCodePageStateful(int code_page,
7060 const char *s,
7061 Py_ssize_t size,
7062 const char *errors,
7063 Py_ssize_t *consumed)
7064{
7065 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7066}
7067
7068PyObject *
7069PyUnicode_DecodeMBCSStateful(const char *s,
7070 Py_ssize_t size,
7071 const char *errors,
7072 Py_ssize_t *consumed)
7073{
7074 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7075}
7076
7077PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007078PyUnicode_DecodeMBCS(const char *s,
7079 Py_ssize_t size,
7080 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007081{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007082 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7083}
7084
Victor Stinner3a50e702011-10-18 21:21:00 +02007085static DWORD
7086encode_code_page_flags(UINT code_page, const char *errors)
7087{
7088 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007089 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007090 }
7091 else if (code_page == CP_UTF7) {
7092 /* CP_UTF7 only supports flags=0 */
7093 return 0;
7094 }
7095 else {
7096 if (errors != NULL && strcmp(errors, "replace") == 0)
7097 return 0;
7098 else
7099 return WC_NO_BEST_FIT_CHARS;
7100 }
7101}
7102
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007103/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007104 * Encode a Unicode string to a Windows code page into a byte string in strict
7105 * mode.
7106 *
7107 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007108 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007109 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007110static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007111encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007112 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007113 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007114{
Victor Stinner554f3f02010-06-16 23:33:54 +00007115 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007116 BOOL *pusedDefaultChar = &usedDefaultChar;
7117 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007118 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007119 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007120 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 const DWORD flags = encode_code_page_flags(code_page, NULL);
7122 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007123 /* Create a substring so that we can get the UTF-16 representation
7124 of just the slice under consideration. */
7125 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007126
Martin v. Löwis3d325192011-11-04 18:23:06 +01007127 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007128
Victor Stinner3a50e702011-10-18 21:21:00 +02007129 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007130 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007131 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007132 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007133
Victor Stinner2fc507f2011-11-04 20:06:39 +01007134 substring = PyUnicode_Substring(unicode, offset, offset+len);
7135 if (substring == NULL)
7136 return -1;
7137 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7138 if (p == NULL) {
7139 Py_DECREF(substring);
7140 return -1;
7141 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007142 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007143
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007144 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007145 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007146 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007147 NULL, 0,
7148 NULL, pusedDefaultChar);
7149 if (outsize <= 0)
7150 goto error;
7151 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007152 if (pusedDefaultChar && *pusedDefaultChar) {
7153 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007154 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007155 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007156
Victor Stinner3a50e702011-10-18 21:21:00 +02007157 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007158 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007159 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007160 if (*outbytes == NULL) {
7161 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007162 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007163 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007164 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007165 }
7166 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007167 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 const Py_ssize_t n = PyBytes_Size(*outbytes);
7169 if (outsize > PY_SSIZE_T_MAX - n) {
7170 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007171 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007172 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007173 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007174 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7175 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007176 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007177 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007178 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007179 }
7180
7181 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007182 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007183 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007184 out, outsize,
7185 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007186 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007187 if (outsize <= 0)
7188 goto error;
7189 if (pusedDefaultChar && *pusedDefaultChar)
7190 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007191 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007192
Victor Stinner3a50e702011-10-18 21:21:00 +02007193error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007194 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007195 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7196 return -2;
7197 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007198 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007199}
7200
Victor Stinner3a50e702011-10-18 21:21:00 +02007201/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007202 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007203 * error handler.
7204 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007205 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007206 * -1 on other error.
7207 */
7208static int
7209encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007210 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007211 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007212{
Victor Stinner3a50e702011-10-18 21:21:00 +02007213 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007214 Py_ssize_t pos = unicode_offset;
7215 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 /* Ideally, we should get reason from FormatMessage. This is the Windows
7217 2000 English version of the message. */
7218 const char *reason = "invalid character";
7219 /* 4=maximum length of a UTF-8 sequence */
7220 char buffer[4];
7221 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7222 Py_ssize_t outsize;
7223 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007224 PyObject *errorHandler = NULL;
7225 PyObject *exc = NULL;
7226 PyObject *encoding_obj = NULL;
7227 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007228 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007229 PyObject *rep;
7230 int ret = -1;
7231
7232 assert(insize > 0);
7233
7234 encoding = code_page_name(code_page, &encoding_obj);
7235 if (encoding == NULL)
7236 return -1;
7237
7238 if (errors == NULL || strcmp(errors, "strict") == 0) {
7239 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7240 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007241 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007242 if (exc != NULL) {
7243 PyCodec_StrictErrors(exc);
7244 Py_DECREF(exc);
7245 }
7246 Py_XDECREF(encoding_obj);
7247 return -1;
7248 }
7249
7250 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7251 pusedDefaultChar = &usedDefaultChar;
7252 else
7253 pusedDefaultChar = NULL;
7254
7255 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7256 PyErr_NoMemory();
7257 goto error;
7258 }
7259 outsize = insize * Py_ARRAY_LENGTH(buffer);
7260
7261 if (*outbytes == NULL) {
7262 /* Create string object */
7263 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7264 if (*outbytes == NULL)
7265 goto error;
7266 out = PyBytes_AS_STRING(*outbytes);
7267 }
7268 else {
7269 /* Extend string object */
7270 Py_ssize_t n = PyBytes_Size(*outbytes);
7271 if (n > PY_SSIZE_T_MAX - outsize) {
7272 PyErr_NoMemory();
7273 goto error;
7274 }
7275 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7276 goto error;
7277 out = PyBytes_AS_STRING(*outbytes) + n;
7278 }
7279
7280 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007281 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007282 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007283 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7284 wchar_t chars[2];
7285 int charsize;
7286 if (ch < 0x10000) {
7287 chars[0] = (wchar_t)ch;
7288 charsize = 1;
7289 }
7290 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007291 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7292 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007293 charsize = 2;
7294 }
7295
Victor Stinner3a50e702011-10-18 21:21:00 +02007296 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007297 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007298 buffer, Py_ARRAY_LENGTH(buffer),
7299 NULL, pusedDefaultChar);
7300 if (outsize > 0) {
7301 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7302 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007303 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007304 memcpy(out, buffer, outsize);
7305 out += outsize;
7306 continue;
7307 }
7308 }
7309 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7310 PyErr_SetFromWindowsErr(0);
7311 goto error;
7312 }
7313
Victor Stinner3a50e702011-10-18 21:21:00 +02007314 rep = unicode_encode_call_errorhandler(
7315 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007316 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007317 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007318 if (rep == NULL)
7319 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007320 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007321
7322 if (PyBytes_Check(rep)) {
7323 outsize = PyBytes_GET_SIZE(rep);
7324 if (outsize != 1) {
7325 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7326 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7327 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7328 Py_DECREF(rep);
7329 goto error;
7330 }
7331 out = PyBytes_AS_STRING(*outbytes) + offset;
7332 }
7333 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7334 out += outsize;
7335 }
7336 else {
7337 Py_ssize_t i;
7338 enum PyUnicode_Kind kind;
7339 void *data;
7340
Benjamin Petersonbac79492012-01-14 13:34:47 -05007341 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007342 Py_DECREF(rep);
7343 goto error;
7344 }
7345
7346 outsize = PyUnicode_GET_LENGTH(rep);
7347 if (outsize != 1) {
7348 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7349 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7350 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7351 Py_DECREF(rep);
7352 goto error;
7353 }
7354 out = PyBytes_AS_STRING(*outbytes) + offset;
7355 }
7356 kind = PyUnicode_KIND(rep);
7357 data = PyUnicode_DATA(rep);
7358 for (i=0; i < outsize; i++) {
7359 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7360 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007361 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007362 encoding, unicode,
7363 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007364 "unable to encode error handler result to ASCII");
7365 Py_DECREF(rep);
7366 goto error;
7367 }
7368 *out = (unsigned char)ch;
7369 out++;
7370 }
7371 }
7372 Py_DECREF(rep);
7373 }
7374 /* write a NUL byte */
7375 *out = 0;
7376 outsize = out - PyBytes_AS_STRING(*outbytes);
7377 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7378 if (_PyBytes_Resize(outbytes, outsize) < 0)
7379 goto error;
7380 ret = 0;
7381
7382error:
7383 Py_XDECREF(encoding_obj);
7384 Py_XDECREF(errorHandler);
7385 Py_XDECREF(exc);
7386 return ret;
7387}
7388
Victor Stinner3a50e702011-10-18 21:21:00 +02007389static PyObject *
7390encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007391 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007392 const char *errors)
7393{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007394 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007395 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007396 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007397 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007398
Victor Stinner29dacf22015-01-26 16:41:32 +01007399 if (!PyUnicode_Check(unicode)) {
7400 PyErr_BadArgument();
7401 return NULL;
7402 }
7403
Benjamin Petersonbac79492012-01-14 13:34:47 -05007404 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007405 return NULL;
7406 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007407
Victor Stinner3a50e702011-10-18 21:21:00 +02007408 if (code_page < 0) {
7409 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7410 return NULL;
7411 }
7412
Martin v. Löwis3d325192011-11-04 18:23:06 +01007413 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007414 return PyBytes_FromStringAndSize(NULL, 0);
7415
Victor Stinner7581cef2011-11-03 22:32:33 +01007416 offset = 0;
7417 do
7418 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007419#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007420 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007421 chunks. */
7422 if (len > INT_MAX/2) {
7423 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007424 done = 0;
7425 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007426 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007427#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007428 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007429 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007430 done = 1;
7431 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007432
Victor Stinner76a31a62011-11-04 00:05:13 +01007433 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007434 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007435 errors);
7436 if (ret == -2)
7437 ret = encode_code_page_errors(code_page, &outbytes,
7438 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007439 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007440 if (ret < 0) {
7441 Py_XDECREF(outbytes);
7442 return NULL;
7443 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007444
Victor Stinner7581cef2011-11-03 22:32:33 +01007445 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007446 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007447 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007448
Victor Stinner3a50e702011-10-18 21:21:00 +02007449 return outbytes;
7450}
7451
7452PyObject *
7453PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7454 Py_ssize_t size,
7455 const char *errors)
7456{
Victor Stinner7581cef2011-11-03 22:32:33 +01007457 PyObject *unicode, *res;
7458 unicode = PyUnicode_FromUnicode(p, size);
7459 if (unicode == NULL)
7460 return NULL;
7461 res = encode_code_page(CP_ACP, unicode, errors);
7462 Py_DECREF(unicode);
7463 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007464}
7465
7466PyObject *
7467PyUnicode_EncodeCodePage(int code_page,
7468 PyObject *unicode,
7469 const char *errors)
7470{
Victor Stinner7581cef2011-11-03 22:32:33 +01007471 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007472}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007473
Alexander Belopolsky40018472011-02-26 01:02:56 +00007474PyObject *
7475PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007476{
Victor Stinner7581cef2011-11-03 22:32:33 +01007477 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007478}
7479
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007480#undef NEED_RETRY
7481
Victor Stinner99b95382011-07-04 14:23:54 +02007482#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007483
Guido van Rossumd57fd912000-03-10 22:53:23 +00007484/* --- Character Mapping Codec -------------------------------------------- */
7485
Victor Stinnerfb161b12013-04-18 01:44:27 +02007486static int
7487charmap_decode_string(const char *s,
7488 Py_ssize_t size,
7489 PyObject *mapping,
7490 const char *errors,
7491 _PyUnicodeWriter *writer)
7492{
7493 const char *starts = s;
7494 const char *e;
7495 Py_ssize_t startinpos, endinpos;
7496 PyObject *errorHandler = NULL, *exc = NULL;
7497 Py_ssize_t maplen;
7498 enum PyUnicode_Kind mapkind;
7499 void *mapdata;
7500 Py_UCS4 x;
7501 unsigned char ch;
7502
7503 if (PyUnicode_READY(mapping) == -1)
7504 return -1;
7505
7506 maplen = PyUnicode_GET_LENGTH(mapping);
7507 mapdata = PyUnicode_DATA(mapping);
7508 mapkind = PyUnicode_KIND(mapping);
7509
7510 e = s + size;
7511
7512 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7513 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7514 * is disabled in encoding aliases, latin1 is preferred because
7515 * its implementation is faster. */
7516 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7517 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7518 Py_UCS4 maxchar = writer->maxchar;
7519
7520 assert (writer->kind == PyUnicode_1BYTE_KIND);
7521 while (s < e) {
7522 ch = *s;
7523 x = mapdata_ucs1[ch];
7524 if (x > maxchar) {
7525 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7526 goto onError;
7527 maxchar = writer->maxchar;
7528 outdata = (Py_UCS1 *)writer->data;
7529 }
7530 outdata[writer->pos] = x;
7531 writer->pos++;
7532 ++s;
7533 }
7534 return 0;
7535 }
7536
7537 while (s < e) {
7538 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7539 enum PyUnicode_Kind outkind = writer->kind;
7540 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7541 if (outkind == PyUnicode_1BYTE_KIND) {
7542 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7543 Py_UCS4 maxchar = writer->maxchar;
7544 while (s < e) {
7545 ch = *s;
7546 x = mapdata_ucs2[ch];
7547 if (x > maxchar)
7548 goto Error;
7549 outdata[writer->pos] = x;
7550 writer->pos++;
7551 ++s;
7552 }
7553 break;
7554 }
7555 else if (outkind == PyUnicode_2BYTE_KIND) {
7556 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7557 while (s < e) {
7558 ch = *s;
7559 x = mapdata_ucs2[ch];
7560 if (x == 0xFFFE)
7561 goto Error;
7562 outdata[writer->pos] = x;
7563 writer->pos++;
7564 ++s;
7565 }
7566 break;
7567 }
7568 }
7569 ch = *s;
7570
7571 if (ch < maplen)
7572 x = PyUnicode_READ(mapkind, mapdata, ch);
7573 else
7574 x = 0xfffe; /* invalid value */
7575Error:
7576 if (x == 0xfffe)
7577 {
7578 /* undefined mapping */
7579 startinpos = s-starts;
7580 endinpos = startinpos+1;
7581 if (unicode_decode_call_errorhandler_writer(
7582 errors, &errorHandler,
7583 "charmap", "character maps to <undefined>",
7584 &starts, &e, &startinpos, &endinpos, &exc, &s,
7585 writer)) {
7586 goto onError;
7587 }
7588 continue;
7589 }
7590
7591 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7592 goto onError;
7593 ++s;
7594 }
7595 Py_XDECREF(errorHandler);
7596 Py_XDECREF(exc);
7597 return 0;
7598
7599onError:
7600 Py_XDECREF(errorHandler);
7601 Py_XDECREF(exc);
7602 return -1;
7603}
7604
7605static int
7606charmap_decode_mapping(const char *s,
7607 Py_ssize_t size,
7608 PyObject *mapping,
7609 const char *errors,
7610 _PyUnicodeWriter *writer)
7611{
7612 const char *starts = s;
7613 const char *e;
7614 Py_ssize_t startinpos, endinpos;
7615 PyObject *errorHandler = NULL, *exc = NULL;
7616 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007617 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007618
7619 e = s + size;
7620
7621 while (s < e) {
7622 ch = *s;
7623
7624 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7625 key = PyLong_FromLong((long)ch);
7626 if (key == NULL)
7627 goto onError;
7628
7629 item = PyObject_GetItem(mapping, key);
7630 Py_DECREF(key);
7631 if (item == NULL) {
7632 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7633 /* No mapping found means: mapping is undefined. */
7634 PyErr_Clear();
7635 goto Undefined;
7636 } else
7637 goto onError;
7638 }
7639
7640 /* Apply mapping */
7641 if (item == Py_None)
7642 goto Undefined;
7643 if (PyLong_Check(item)) {
7644 long value = PyLong_AS_LONG(item);
7645 if (value == 0xFFFE)
7646 goto Undefined;
7647 if (value < 0 || value > MAX_UNICODE) {
7648 PyErr_Format(PyExc_TypeError,
7649 "character mapping must be in range(0x%lx)",
7650 (unsigned long)MAX_UNICODE + 1);
7651 goto onError;
7652 }
7653
7654 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7655 goto onError;
7656 }
7657 else if (PyUnicode_Check(item)) {
7658 if (PyUnicode_READY(item) == -1)
7659 goto onError;
7660 if (PyUnicode_GET_LENGTH(item) == 1) {
7661 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7662 if (value == 0xFFFE)
7663 goto Undefined;
7664 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7665 goto onError;
7666 }
7667 else {
7668 writer->overallocate = 1;
7669 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7670 goto onError;
7671 }
7672 }
7673 else {
7674 /* wrong return value */
7675 PyErr_SetString(PyExc_TypeError,
7676 "character mapping must return integer, None or str");
7677 goto onError;
7678 }
7679 Py_CLEAR(item);
7680 ++s;
7681 continue;
7682
7683Undefined:
7684 /* undefined mapping */
7685 Py_CLEAR(item);
7686 startinpos = s-starts;
7687 endinpos = startinpos+1;
7688 if (unicode_decode_call_errorhandler_writer(
7689 errors, &errorHandler,
7690 "charmap", "character maps to <undefined>",
7691 &starts, &e, &startinpos, &endinpos, &exc, &s,
7692 writer)) {
7693 goto onError;
7694 }
7695 }
7696 Py_XDECREF(errorHandler);
7697 Py_XDECREF(exc);
7698 return 0;
7699
7700onError:
7701 Py_XDECREF(item);
7702 Py_XDECREF(errorHandler);
7703 Py_XDECREF(exc);
7704 return -1;
7705}
7706
Alexander Belopolsky40018472011-02-26 01:02:56 +00007707PyObject *
7708PyUnicode_DecodeCharmap(const char *s,
7709 Py_ssize_t size,
7710 PyObject *mapping,
7711 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007713 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007714
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715 /* Default to Latin-1 */
7716 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007717 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007720 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007721 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007722 writer.min_length = size;
7723 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007725
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007726 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007727 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7728 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007729 }
7730 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007731 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7732 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007734 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007735
Benjamin Peterson29060642009-01-31 22:14:21 +00007736 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007737 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738 return NULL;
7739}
7740
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007741/* Charmap encoding: the lookup table */
7742
Alexander Belopolsky40018472011-02-26 01:02:56 +00007743struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007744 PyObject_HEAD
7745 unsigned char level1[32];
7746 int count2, count3;
7747 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007748};
7749
7750static PyObject*
7751encoding_map_size(PyObject *obj, PyObject* args)
7752{
7753 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007754 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007755 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007756}
7757
7758static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007759 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 PyDoc_STR("Return the size (in bytes) of this object") },
7761 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007762};
7763
7764static void
7765encoding_map_dealloc(PyObject* o)
7766{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007767 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007768}
7769
7770static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007771 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007772 "EncodingMap", /*tp_name*/
7773 sizeof(struct encoding_map), /*tp_basicsize*/
7774 0, /*tp_itemsize*/
7775 /* methods */
7776 encoding_map_dealloc, /*tp_dealloc*/
7777 0, /*tp_print*/
7778 0, /*tp_getattr*/
7779 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007780 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007781 0, /*tp_repr*/
7782 0, /*tp_as_number*/
7783 0, /*tp_as_sequence*/
7784 0, /*tp_as_mapping*/
7785 0, /*tp_hash*/
7786 0, /*tp_call*/
7787 0, /*tp_str*/
7788 0, /*tp_getattro*/
7789 0, /*tp_setattro*/
7790 0, /*tp_as_buffer*/
7791 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7792 0, /*tp_doc*/
7793 0, /*tp_traverse*/
7794 0, /*tp_clear*/
7795 0, /*tp_richcompare*/
7796 0, /*tp_weaklistoffset*/
7797 0, /*tp_iter*/
7798 0, /*tp_iternext*/
7799 encoding_map_methods, /*tp_methods*/
7800 0, /*tp_members*/
7801 0, /*tp_getset*/
7802 0, /*tp_base*/
7803 0, /*tp_dict*/
7804 0, /*tp_descr_get*/
7805 0, /*tp_descr_set*/
7806 0, /*tp_dictoffset*/
7807 0, /*tp_init*/
7808 0, /*tp_alloc*/
7809 0, /*tp_new*/
7810 0, /*tp_free*/
7811 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007812};
7813
7814PyObject*
7815PyUnicode_BuildEncodingMap(PyObject* string)
7816{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007817 PyObject *result;
7818 struct encoding_map *mresult;
7819 int i;
7820 int need_dict = 0;
7821 unsigned char level1[32];
7822 unsigned char level2[512];
7823 unsigned char *mlevel1, *mlevel2, *mlevel3;
7824 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007825 int kind;
7826 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007827 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007828 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007829
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007830 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007831 PyErr_BadArgument();
7832 return NULL;
7833 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007834 kind = PyUnicode_KIND(string);
7835 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007836 length = PyUnicode_GET_LENGTH(string);
7837 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007838 memset(level1, 0xFF, sizeof level1);
7839 memset(level2, 0xFF, sizeof level2);
7840
7841 /* If there isn't a one-to-one mapping of NULL to \0,
7842 or if there are non-BMP characters, we need to use
7843 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007844 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007845 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007846 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007847 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007848 ch = PyUnicode_READ(kind, data, i);
7849 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007850 need_dict = 1;
7851 break;
7852 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007853 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007854 /* unmapped character */
7855 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007856 l1 = ch >> 11;
7857 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007858 if (level1[l1] == 0xFF)
7859 level1[l1] = count2++;
7860 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007861 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007862 }
7863
7864 if (count2 >= 0xFF || count3 >= 0xFF)
7865 need_dict = 1;
7866
7867 if (need_dict) {
7868 PyObject *result = PyDict_New();
7869 PyObject *key, *value;
7870 if (!result)
7871 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007872 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007873 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007874 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007875 if (!key || !value)
7876 goto failed1;
7877 if (PyDict_SetItem(result, key, value) == -1)
7878 goto failed1;
7879 Py_DECREF(key);
7880 Py_DECREF(value);
7881 }
7882 return result;
7883 failed1:
7884 Py_XDECREF(key);
7885 Py_XDECREF(value);
7886 Py_DECREF(result);
7887 return NULL;
7888 }
7889
7890 /* Create a three-level trie */
7891 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7892 16*count2 + 128*count3 - 1);
7893 if (!result)
7894 return PyErr_NoMemory();
7895 PyObject_Init(result, &EncodingMapType);
7896 mresult = (struct encoding_map*)result;
7897 mresult->count2 = count2;
7898 mresult->count3 = count3;
7899 mlevel1 = mresult->level1;
7900 mlevel2 = mresult->level23;
7901 mlevel3 = mresult->level23 + 16*count2;
7902 memcpy(mlevel1, level1, 32);
7903 memset(mlevel2, 0xFF, 16*count2);
7904 memset(mlevel3, 0, 128*count3);
7905 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007906 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007907 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007908 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7909 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007910 /* unmapped character */
7911 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007912 o1 = ch>>11;
7913 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007914 i2 = 16*mlevel1[o1] + o2;
7915 if (mlevel2[i2] == 0xFF)
7916 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007917 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007918 i3 = 128*mlevel2[i2] + o3;
7919 mlevel3[i3] = i;
7920 }
7921 return result;
7922}
7923
7924static int
Victor Stinner22168992011-11-20 17:09:18 +01007925encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007926{
7927 struct encoding_map *map = (struct encoding_map*)mapping;
7928 int l1 = c>>11;
7929 int l2 = (c>>7) & 0xF;
7930 int l3 = c & 0x7F;
7931 int i;
7932
Victor Stinner22168992011-11-20 17:09:18 +01007933 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007934 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007935 if (c == 0)
7936 return 0;
7937 /* level 1*/
7938 i = map->level1[l1];
7939 if (i == 0xFF) {
7940 return -1;
7941 }
7942 /* level 2*/
7943 i = map->level23[16*i+l2];
7944 if (i == 0xFF) {
7945 return -1;
7946 }
7947 /* level 3 */
7948 i = map->level23[16*map->count2 + 128*i + l3];
7949 if (i == 0) {
7950 return -1;
7951 }
7952 return i;
7953}
7954
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007955/* Lookup the character ch in the mapping. If the character
7956 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007957 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007958static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007959charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960{
Christian Heimes217cfd12007-12-02 14:31:20 +00007961 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007962 PyObject *x;
7963
7964 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007965 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007966 x = PyObject_GetItem(mapping, w);
7967 Py_DECREF(w);
7968 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007969 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7970 /* No mapping found means: mapping is undefined. */
7971 PyErr_Clear();
7972 x = Py_None;
7973 Py_INCREF(x);
7974 return x;
7975 } else
7976 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007978 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007980 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007981 long value = PyLong_AS_LONG(x);
7982 if (value < 0 || value > 255) {
7983 PyErr_SetString(PyExc_TypeError,
7984 "character mapping must be in range(256)");
7985 Py_DECREF(x);
7986 return NULL;
7987 }
7988 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007990 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 /* wrong return value */
7994 PyErr_Format(PyExc_TypeError,
7995 "character mapping must return integer, bytes or None, not %.400s",
7996 x->ob_type->tp_name);
7997 Py_DECREF(x);
7998 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999 }
8000}
8001
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008002static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008003charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008004{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008005 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8006 /* exponentially overallocate to minimize reallocations */
8007 if (requiredsize < 2*outsize)
8008 requiredsize = 2*outsize;
8009 if (_PyBytes_Resize(outobj, requiredsize))
8010 return -1;
8011 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008012}
8013
/* Outcome of trying to encode one character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the mapped byte(s) were written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008017/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008018 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008019 space is available. Return a new reference to the object that
8020 was put in the output buffer, or Py_None, if the mapping was undefined
8021 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008022 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008023static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008024charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008025 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008026{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008027 PyObject *rep;
8028 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008029 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008030
Christian Heimes90aa7642007-12-19 02:45:37 +00008031 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008032 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008033 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008034 if (res == -1)
8035 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008036 if (outsize<requiredsize)
8037 if (charmapencode_resize(outobj, outpos, requiredsize))
8038 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008039 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 outstart[(*outpos)++] = (char)res;
8041 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008042 }
8043
8044 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008045 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008046 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008047 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 Py_DECREF(rep);
8049 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008050 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008051 if (PyLong_Check(rep)) {
8052 Py_ssize_t requiredsize = *outpos+1;
8053 if (outsize<requiredsize)
8054 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8055 Py_DECREF(rep);
8056 return enc_EXCEPTION;
8057 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008058 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008059 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008060 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008061 else {
8062 const char *repchars = PyBytes_AS_STRING(rep);
8063 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8064 Py_ssize_t requiredsize = *outpos+repsize;
8065 if (outsize<requiredsize)
8066 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8067 Py_DECREF(rep);
8068 return enc_EXCEPTION;
8069 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008070 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008071 memcpy(outstart + *outpos, repchars, repsize);
8072 *outpos += repsize;
8073 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008074 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008075 Py_DECREF(rep);
8076 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008077}
8078
8079/* handle an error in PyUnicode_EncodeCharmap
8080 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008081static int
8082charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008083 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008084 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008085 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008086 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008087{
8088 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008089 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008090 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008091 enum PyUnicode_Kind kind;
8092 void *data;
8093 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008094 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008095 Py_ssize_t collstartpos = *inpos;
8096 Py_ssize_t collendpos = *inpos+1;
8097 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008098 char *encoding = "charmap";
8099 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008100 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008101 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008102 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008103
Benjamin Petersonbac79492012-01-14 13:34:47 -05008104 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008105 return -1;
8106 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008107 /* find all unencodable characters */
8108 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008109 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008110 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008111 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008112 val = encoding_map_lookup(ch, mapping);
8113 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 break;
8115 ++collendpos;
8116 continue;
8117 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008118
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008119 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8120 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008121 if (rep==NULL)
8122 return -1;
8123 else if (rep!=Py_None) {
8124 Py_DECREF(rep);
8125 break;
8126 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008127 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008128 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008129 }
8130 /* cache callback name lookup
8131 * (if not done yet, i.e. it's the first error) */
8132 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008133 if ((errors==NULL) || (!strcmp(errors, "strict")))
8134 *known_errorHandler = 1;
8135 else if (!strcmp(errors, "replace"))
8136 *known_errorHandler = 2;
8137 else if (!strcmp(errors, "ignore"))
8138 *known_errorHandler = 3;
8139 else if (!strcmp(errors, "xmlcharrefreplace"))
8140 *known_errorHandler = 4;
8141 else
8142 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008143 }
8144 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008145 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008146 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008147 return -1;
8148 case 2: /* replace */
8149 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 x = charmapencode_output('?', mapping, res, respos);
8151 if (x==enc_EXCEPTION) {
8152 return -1;
8153 }
8154 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008155 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008156 return -1;
8157 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008158 }
8159 /* fall through */
8160 case 3: /* ignore */
8161 *inpos = collendpos;
8162 break;
8163 case 4: /* xmlcharrefreplace */
8164 /* generate replacement (temporarily (mis)uses p) */
8165 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 char buffer[2+29+1+1];
8167 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008168 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008169 for (cp = buffer; *cp; ++cp) {
8170 x = charmapencode_output(*cp, mapping, res, respos);
8171 if (x==enc_EXCEPTION)
8172 return -1;
8173 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008174 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 return -1;
8176 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008177 }
8178 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008179 *inpos = collendpos;
8180 break;
8181 default:
8182 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008183 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008185 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008187 if (PyBytes_Check(repunicode)) {
8188 /* Directly copy bytes result to output. */
8189 Py_ssize_t outsize = PyBytes_Size(*res);
8190 Py_ssize_t requiredsize;
8191 repsize = PyBytes_Size(repunicode);
8192 requiredsize = *respos + repsize;
8193 if (requiredsize > outsize)
8194 /* Make room for all additional bytes. */
8195 if (charmapencode_resize(res, respos, requiredsize)) {
8196 Py_DECREF(repunicode);
8197 return -1;
8198 }
8199 memcpy(PyBytes_AsString(*res) + *respos,
8200 PyBytes_AsString(repunicode), repsize);
8201 *respos += repsize;
8202 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008203 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008204 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008205 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008206 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008207 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008208 Py_DECREF(repunicode);
8209 return -1;
8210 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008211 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008212 data = PyUnicode_DATA(repunicode);
8213 kind = PyUnicode_KIND(repunicode);
8214 for (index = 0; index < repsize; index++) {
8215 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8216 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008217 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008218 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 return -1;
8220 }
8221 else if (x==enc_FAILED) {
8222 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008223 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008224 return -1;
8225 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008226 }
8227 *inpos = newpos;
8228 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008229 }
8230 return 0;
8231}
8232
Alexander Belopolsky40018472011-02-26 01:02:56 +00008233PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008234_PyUnicode_EncodeCharmap(PyObject *unicode,
8235 PyObject *mapping,
8236 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008238 /* output object */
8239 PyObject *res = NULL;
8240 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008241 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008242 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008243 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008244 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008245 PyObject *errorHandler = NULL;
8246 PyObject *exc = NULL;
8247 /* the following variable is used for caching string comparisons
8248 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8249 * 3=ignore, 4=xmlcharrefreplace */
8250 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008251 void *data;
8252 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253
Benjamin Petersonbac79492012-01-14 13:34:47 -05008254 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008255 return NULL;
8256 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008257 data = PyUnicode_DATA(unicode);
8258 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008259
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 /* Default to Latin-1 */
8261 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008262 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008264 /* allocate enough for a simple encoding without
8265 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008266 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008267 if (res == NULL)
8268 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008269 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008272 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008273 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008275 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 if (x==enc_EXCEPTION) /* error */
8277 goto onError;
8278 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008279 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 &exc,
8281 &known_errorHandler, &errorHandler, errors,
8282 &res, &respos)) {
8283 goto onError;
8284 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008285 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008286 else
8287 /* done with this character => adjust input position */
8288 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008291 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008292 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008293 if (_PyBytes_Resize(&res, respos) < 0)
8294 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008295
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008296 Py_XDECREF(exc);
8297 Py_XDECREF(errorHandler);
8298 return res;
8299
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008301 Py_XDECREF(res);
8302 Py_XDECREF(exc);
8303 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304 return NULL;
8305}
8306
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008307/* Deprecated */
8308PyObject *
8309PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8310 Py_ssize_t size,
8311 PyObject *mapping,
8312 const char *errors)
8313{
8314 PyObject *result;
8315 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8316 if (unicode == NULL)
8317 return NULL;
8318 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8319 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008320 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008321}
8322
Alexander Belopolsky40018472011-02-26 01:02:56 +00008323PyObject *
8324PyUnicode_AsCharmapString(PyObject *unicode,
8325 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326{
8327 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008328 PyErr_BadArgument();
8329 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008331 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332}
8333
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008334/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008335static void
8336make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008337 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008338 Py_ssize_t startpos, Py_ssize_t endpos,
8339 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008341 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008342 *exceptionObject = _PyUnicodeTranslateError_Create(
8343 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344 }
8345 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8347 goto onError;
8348 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8349 goto onError;
8350 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8351 goto onError;
8352 return;
8353 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008354 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008355 }
8356}
8357
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008358/* error handling callback helper:
8359 build arguments, call the callback and check the arguments,
8360 put the result into newpos and return the replacement string, which
8361 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008362static PyObject *
8363unicode_translate_call_errorhandler(const char *errors,
8364 PyObject **errorHandler,
8365 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008366 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008367 Py_ssize_t startpos, Py_ssize_t endpos,
8368 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008369{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008370 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008371
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008372 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008373 PyObject *restuple;
8374 PyObject *resunicode;
8375
8376 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008378 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008380 }
8381
8382 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008383 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008384 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008386
8387 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008389 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008390 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008391 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008392 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 Py_DECREF(restuple);
8394 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008395 }
8396 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 &resunicode, &i_newpos)) {
8398 Py_DECREF(restuple);
8399 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008400 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008401 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008402 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008403 else
8404 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008405 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008406 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 Py_DECREF(restuple);
8408 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008409 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008410 Py_INCREF(resunicode);
8411 Py_DECREF(restuple);
8412 return resunicode;
8413}
8414
8415/* Lookup the character ch in the mapping and put the result in result,
8416 which must be decrefed by the caller.
8417 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008418static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008419charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008420{
Christian Heimes217cfd12007-12-02 14:31:20 +00008421 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422 PyObject *x;
8423
8424 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008426 x = PyObject_GetItem(mapping, w);
8427 Py_DECREF(w);
8428 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8430 /* No mapping found means: use 1:1 mapping. */
8431 PyErr_Clear();
8432 *result = NULL;
8433 return 0;
8434 } else
8435 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008436 }
8437 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 *result = x;
8439 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008440 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008441 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008443 if (value < 0 || value > MAX_UNICODE) {
8444 PyErr_Format(PyExc_ValueError,
8445 "character mapping must be in range(0x%x)",
8446 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 Py_DECREF(x);
8448 return -1;
8449 }
8450 *result = x;
8451 return 0;
8452 }
8453 else if (PyUnicode_Check(x)) {
8454 *result = x;
8455 return 0;
8456 }
8457 else {
8458 /* wrong return value */
8459 PyErr_SetString(PyExc_TypeError,
8460 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008461 Py_DECREF(x);
8462 return -1;
8463 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008464}
Victor Stinner1194ea02014-04-04 19:37:40 +02008465
8466/* lookup the character, write the result into the writer.
8467 Return 1 if the result was written into the writer, return 0 if the mapping
8468 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008469static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008470charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8471 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008472{
Victor Stinner1194ea02014-04-04 19:37:40 +02008473 PyObject *item;
8474
8475 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008477
8478 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008480 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008481 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008483 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008484 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008485
8486 if (item == Py_None) {
8487 Py_DECREF(item);
8488 return 0;
8489 }
8490
8491 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008492 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8493 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8494 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008495 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8496 Py_DECREF(item);
8497 return -1;
8498 }
8499 Py_DECREF(item);
8500 return 1;
8501 }
8502
8503 if (!PyUnicode_Check(item)) {
8504 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008506 }
8507
8508 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8509 Py_DECREF(item);
8510 return -1;
8511 }
8512
8513 Py_DECREF(item);
8514 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008515}
8516
Victor Stinner89a76ab2014-04-05 11:44:04 +02008517static int
8518unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8519 Py_UCS1 *translate)
8520{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008521 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008522 int ret = 0;
8523
Victor Stinner89a76ab2014-04-05 11:44:04 +02008524 if (charmaptranslate_lookup(ch, mapping, &item)) {
8525 return -1;
8526 }
8527
8528 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008529 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008530 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008531 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008532 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008533 /* not found => default to 1:1 mapping */
8534 translate[ch] = ch;
8535 return 1;
8536 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008537 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008538 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008539 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8540 used it */
8541 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008542 /* invalid character or character outside ASCII:
8543 skip the fast translate */
8544 goto exit;
8545 }
8546 translate[ch] = (Py_UCS1)replace;
8547 }
8548 else if (PyUnicode_Check(item)) {
8549 Py_UCS4 replace;
8550
8551 if (PyUnicode_READY(item) == -1) {
8552 Py_DECREF(item);
8553 return -1;
8554 }
8555 if (PyUnicode_GET_LENGTH(item) != 1)
8556 goto exit;
8557
8558 replace = PyUnicode_READ_CHAR(item, 0);
8559 if (replace > 127)
8560 goto exit;
8561 translate[ch] = (Py_UCS1)replace;
8562 }
8563 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008564 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008565 goto exit;
8566 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008567 ret = 1;
8568
Benjamin Peterson1365de72014-04-07 20:15:41 -04008569 exit:
8570 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008571 return ret;
8572}
8573
8574/* Fast path for ascii => ascii translation. Return 1 if the whole string
8575 was translated into writer, return 0 if the input string was partially
8576 translated into writer, raise an exception and return -1 on error. */
8577static int
8578unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008579 _PyUnicodeWriter *writer, int ignore,
8580 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008581{
Victor Stinner872b2912014-04-05 14:27:07 +02008582 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008583 Py_ssize_t len;
8584 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008585 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008586
Victor Stinner89a76ab2014-04-05 11:44:04 +02008587 len = PyUnicode_GET_LENGTH(input);
8588
Victor Stinner872b2912014-04-05 14:27:07 +02008589 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008590
8591 in = PyUnicode_1BYTE_DATA(input);
8592 end = in + len;
8593
8594 assert(PyUnicode_IS_ASCII(writer->buffer));
8595 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8596 out = PyUnicode_1BYTE_DATA(writer->buffer);
8597
Victor Stinner872b2912014-04-05 14:27:07 +02008598 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008599 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008600 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008601 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008602 int translate = unicode_fast_translate_lookup(mapping, ch,
8603 ascii_table);
8604 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008605 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008606 if (translate == 0)
8607 goto exit;
8608 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008609 }
Victor Stinner872b2912014-04-05 14:27:07 +02008610 if (ch2 == 0xfe) {
8611 if (ignore)
8612 continue;
8613 goto exit;
8614 }
8615 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008616 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008617 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008618 }
Victor Stinner872b2912014-04-05 14:27:07 +02008619 res = 1;
8620
8621exit:
8622 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008623 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008624 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008625}
8626
Alexander Belopolsky40018472011-02-26 01:02:56 +00008627PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008628_PyUnicode_TranslateCharmap(PyObject *input,
8629 PyObject *mapping,
8630 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008633 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008634 Py_ssize_t size, i;
8635 int kind;
8636 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008637 _PyUnicodeWriter writer;
8638 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008639 char *reason = "character maps to <undefined>";
8640 PyObject *errorHandler = NULL;
8641 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008642 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008643 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008644
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 PyErr_BadArgument();
8647 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008649
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008650 if (PyUnicode_READY(input) == -1)
8651 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008652 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008653 kind = PyUnicode_KIND(input);
8654 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008655
8656 if (size == 0) {
8657 Py_INCREF(input);
8658 return input;
8659 }
8660
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008661 /* allocate enough for a simple 1:1 translation without
8662 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008663 _PyUnicodeWriter_Init(&writer);
8664 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666
Victor Stinner872b2912014-04-05 14:27:07 +02008667 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8668
Victor Stinner33798672016-03-01 21:59:58 +01008669 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008670 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008671 if (PyUnicode_IS_ASCII(input)) {
8672 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8673 if (res < 0) {
8674 _PyUnicodeWriter_Dealloc(&writer);
8675 return NULL;
8676 }
8677 if (res == 1)
8678 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008679 }
Victor Stinner33798672016-03-01 21:59:58 +01008680 else {
8681 i = 0;
8682 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008686 int translate;
8687 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8688 Py_ssize_t newpos;
8689 /* startpos for collecting untranslatable chars */
8690 Py_ssize_t collstart;
8691 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008692 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693
Victor Stinner1194ea02014-04-04 19:37:40 +02008694 ch = PyUnicode_READ(kind, data, i);
8695 translate = charmaptranslate_output(ch, mapping, &writer);
8696 if (translate < 0)
8697 goto onError;
8698
8699 if (translate != 0) {
8700 /* it worked => adjust input pointer */
8701 ++i;
8702 continue;
8703 }
8704
8705 /* untranslatable character */
8706 collstart = i;
8707 collend = i+1;
8708
8709 /* find all untranslatable characters */
8710 while (collend < size) {
8711 PyObject *x;
8712 ch = PyUnicode_READ(kind, data, collend);
8713 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008714 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008715 Py_XDECREF(x);
8716 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008718 ++collend;
8719 }
8720
8721 if (ignore) {
8722 i = collend;
8723 }
8724 else {
8725 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8726 reason, input, &exc,
8727 collstart, collend, &newpos);
8728 if (repunicode == NULL)
8729 goto onError;
8730 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008732 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008733 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008734 Py_DECREF(repunicode);
8735 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008736 }
8737 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008738 Py_XDECREF(exc);
8739 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008740 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741
Benjamin Peterson29060642009-01-31 22:14:21 +00008742 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008743 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008744 Py_XDECREF(exc);
8745 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746 return NULL;
8747}
8748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008749/* Deprecated. Use PyUnicode_Translate instead. */
8750PyObject *
8751PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8752 Py_ssize_t size,
8753 PyObject *mapping,
8754 const char *errors)
8755{
Christian Heimes5f520f42012-09-11 14:03:25 +02008756 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008757 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8758 if (!unicode)
8759 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008760 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8761 Py_DECREF(unicode);
8762 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008763}
8764
Alexander Belopolsky40018472011-02-26 01:02:56 +00008765PyObject *
8766PyUnicode_Translate(PyObject *str,
8767 PyObject *mapping,
8768 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769{
8770 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008771
Guido van Rossumd57fd912000-03-10 22:53:23 +00008772 str = PyUnicode_FromObject(str);
8773 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008774 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008775 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008776 Py_DECREF(str);
8777 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778}
Tim Petersced69f82003-09-16 20:30:58 +00008779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008781fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782{
8783 /* No need to call PyUnicode_READY(self) because this function is only
8784 called as a callback from fixup() which does it already. */
8785 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8786 const int kind = PyUnicode_KIND(self);
8787 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008788 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008789 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008790 Py_ssize_t i;
8791
8792 for (i = 0; i < len; ++i) {
8793 ch = PyUnicode_READ(kind, data, i);
8794 fixed = 0;
8795 if (ch > 127) {
8796 if (Py_UNICODE_ISSPACE(ch))
8797 fixed = ' ';
8798 else {
8799 const int decimal = Py_UNICODE_TODECIMAL(ch);
8800 if (decimal >= 0)
8801 fixed = '0' + decimal;
8802 }
8803 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008804 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008805 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806 PyUnicode_WRITE(kind, data, i, fixed);
8807 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008808 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008809 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008810 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008811 }
8812
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008813 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008814}
8815
8816PyObject *
8817_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8818{
8819 if (!PyUnicode_Check(unicode)) {
8820 PyErr_BadInternalCall();
8821 return NULL;
8822 }
8823 if (PyUnicode_READY(unicode) == -1)
8824 return NULL;
8825 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8826 /* If the string is already ASCII, just return the same string */
8827 Py_INCREF(unicode);
8828 return unicode;
8829 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008830 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831}
8832
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008833PyObject *
8834PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8835 Py_ssize_t length)
8836{
Victor Stinnerf0124502011-11-21 23:12:56 +01008837 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008838 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008839 Py_UCS4 maxchar;
8840 enum PyUnicode_Kind kind;
8841 void *data;
8842
Victor Stinner99d7ad02012-02-22 13:37:39 +01008843 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008844 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02008845 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008846 if (ch > 127) {
8847 int decimal = Py_UNICODE_TODECIMAL(ch);
8848 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008849 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008850 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008851 }
8852 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008853
8854 /* Copy to a new string */
8855 decimal = PyUnicode_New(length, maxchar);
8856 if (decimal == NULL)
8857 return decimal;
8858 kind = PyUnicode_KIND(decimal);
8859 data = PyUnicode_DATA(decimal);
8860 /* Iterate over code points */
8861 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02008862 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01008863 if (ch > 127) {
8864 int decimal = Py_UNICODE_TODECIMAL(ch);
8865 if (decimal >= 0)
8866 ch = '0' + decimal;
8867 }
8868 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008869 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008870 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008871}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008872/* --- Decimal Encoder ---------------------------------------------------- */
8873
Alexander Belopolsky40018472011-02-26 01:02:56 +00008874int
8875PyUnicode_EncodeDecimal(Py_UNICODE *s,
8876 Py_ssize_t length,
8877 char *output,
8878 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008879{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008880 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008881 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008882 enum PyUnicode_Kind kind;
8883 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008884
8885 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008886 PyErr_BadArgument();
8887 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008888 }
8889
Victor Stinner42bf7752011-11-21 22:52:58 +01008890 unicode = PyUnicode_FromUnicode(s, length);
8891 if (unicode == NULL)
8892 return -1;
8893
Benjamin Petersonbac79492012-01-14 13:34:47 -05008894 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008895 Py_DECREF(unicode);
8896 return -1;
8897 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008898 kind = PyUnicode_KIND(unicode);
8899 data = PyUnicode_DATA(unicode);
8900
Victor Stinnerb84d7232011-11-22 01:50:07 +01008901 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008902 PyObject *exc;
8903 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008904 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008905 Py_ssize_t startpos;
8906
8907 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008908
Benjamin Peterson29060642009-01-31 22:14:21 +00008909 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008910 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008911 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008912 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008913 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008914 decimal = Py_UNICODE_TODECIMAL(ch);
8915 if (decimal >= 0) {
8916 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008917 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008918 continue;
8919 }
8920 if (0 < ch && ch < 256) {
8921 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008922 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008923 continue;
8924 }
Victor Stinner6345be92011-11-25 20:09:01 +01008925
Victor Stinner42bf7752011-11-21 22:52:58 +01008926 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008927 exc = NULL;
8928 raise_encode_exception(&exc, "decimal", unicode,
8929 startpos, startpos+1,
8930 "invalid decimal Unicode string");
8931 Py_XDECREF(exc);
8932 Py_DECREF(unicode);
8933 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008934 }
8935 /* 0-terminate the output string */
8936 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008937 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008938 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008939}
8940
Guido van Rossumd57fd912000-03-10 22:53:23 +00008941/* --- Helpers ------------------------------------------------------------ */
8942
/* Helper macro to fix up start/end slice values against a length.
   `end` is clipped to `len`; a negative `end` or `start` is taken
   relative to `len` and clipped to 0 if it is still negative.
   `start` and `end` must be modifiable lvalues; every argument may be
   evaluated more than once.
   The body is wrapped in do { } while (0) so the macro behaves as a
   single statement (safe as the body of an unbraced if/else); use it
   with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
8957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008958static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008959any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960 Py_ssize_t start,
8961 Py_ssize_t end)
8962{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008963 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008964 void *buf1, *buf2;
8965 Py_ssize_t len1, len2, result;
8966
8967 kind1 = PyUnicode_KIND(s1);
8968 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008969 if (kind1 < kind2)
8970 return -1;
8971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008972 len1 = PyUnicode_GET_LENGTH(s1);
8973 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008974 ADJUST_INDICES(start, end, len1);
8975 if (end - start < len2)
8976 return -1;
8977
8978 buf1 = PyUnicode_DATA(s1);
8979 buf2 = PyUnicode_DATA(s2);
8980 if (len2 == 1) {
8981 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
8982 result = findchar((const char *)buf1 + kind1*start,
8983 kind1, end - start, ch, direction);
8984 if (result == -1)
8985 return -1;
8986 else
8987 return start + result;
8988 }
8989
8990 if (kind2 != kind1) {
8991 buf2 = _PyUnicode_AsKind(s2, kind1);
8992 if (!buf2)
8993 return -2;
8994 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008995
Victor Stinner794d5672011-10-10 03:21:36 +02008996 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008997 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02008998 case PyUnicode_1BYTE_KIND:
8999 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9000 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9001 else
9002 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9003 break;
9004 case PyUnicode_2BYTE_KIND:
9005 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9006 break;
9007 case PyUnicode_4BYTE_KIND:
9008 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9009 break;
9010 default:
9011 assert(0); result = -2;
9012 }
9013 }
9014 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009015 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009016 case PyUnicode_1BYTE_KIND:
9017 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9018 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9019 else
9020 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9021 break;
9022 case PyUnicode_2BYTE_KIND:
9023 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9024 break;
9025 case PyUnicode_4BYTE_KIND:
9026 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9027 break;
9028 default:
9029 assert(0); result = -2;
9030 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 }
9032
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009033 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009034 PyMem_Free(buf2);
9035
9036 return result;
9037}
9038
9039Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009040_PyUnicode_InsertThousandsGrouping(
9041 PyObject *unicode, Py_ssize_t index,
9042 Py_ssize_t n_buffer,
9043 void *digits, Py_ssize_t n_digits,
9044 Py_ssize_t min_width,
9045 const char *grouping, PyObject *thousands_sep,
9046 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009047{
Victor Stinner41a863c2012-02-24 00:37:51 +01009048 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009049 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009050 Py_ssize_t thousands_sep_len;
9051 Py_ssize_t len;
9052
9053 if (unicode != NULL) {
9054 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009055 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009056 }
9057 else {
9058 kind = PyUnicode_1BYTE_KIND;
9059 data = NULL;
9060 }
9061 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9062 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9063 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9064 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009065 if (thousands_sep_kind < kind) {
9066 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9067 if (!thousands_sep_data)
9068 return -1;
9069 }
9070 else {
9071 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9072 if (!data)
9073 return -1;
9074 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009075 }
9076
Benjamin Petersonead6b532011-12-20 17:23:42 -06009077 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009078 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009079 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009080 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009081 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009082 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009083 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009084 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009085 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009086 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009087 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009088 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009089 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009091 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009092 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009093 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009094 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009095 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009097 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009098 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009099 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009100 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009101 break;
9102 default:
9103 assert(0);
9104 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009105 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009106 if (unicode != NULL && thousands_sep_kind != kind) {
9107 if (thousands_sep_kind < kind)
9108 PyMem_Free(thousands_sep_data);
9109 else
9110 PyMem_Free(data);
9111 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009112 if (unicode == NULL) {
9113 *maxchar = 127;
9114 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009115 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009116 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009117 }
9118 }
9119 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009120}
9121
9122
Alexander Belopolsky40018472011-02-26 01:02:56 +00009123Py_ssize_t
9124PyUnicode_Count(PyObject *str,
9125 PyObject *substr,
9126 Py_ssize_t start,
9127 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009128{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009129 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009130 PyObject* str_obj;
9131 PyObject* sub_obj;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009132 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009133 void *buf1 = NULL, *buf2 = NULL;
9134 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009135
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009136 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009137 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009138 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009139 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009140 if (!sub_obj) {
9141 Py_DECREF(str_obj);
9142 return -1;
9143 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009144 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009145 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009146 Py_DECREF(str_obj);
9147 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009148 }
Tim Petersced69f82003-09-16 20:30:58 +00009149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009150 kind1 = PyUnicode_KIND(str_obj);
9151 kind2 = PyUnicode_KIND(sub_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009152 if (kind1 < kind2) {
9153 Py_DECREF(sub_obj);
9154 Py_DECREF(str_obj);
9155 return 0;
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009156 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009158 len1 = PyUnicode_GET_LENGTH(str_obj);
9159 len2 = PyUnicode_GET_LENGTH(sub_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009160 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009161 if (end - start < len2) {
9162 Py_DECREF(sub_obj);
9163 Py_DECREF(str_obj);
9164 return 0;
9165 }
9166
9167 buf1 = PyUnicode_DATA(str_obj);
9168 buf2 = PyUnicode_DATA(sub_obj);
9169 if (kind2 != kind1) {
9170 buf2 = _PyUnicode_AsKind(sub_obj, kind1);
9171 if (!buf2)
9172 goto onError;
9173 }
9174
9175 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009176 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009177 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9178 result = asciilib_count(
9179 ((Py_UCS1*)buf1) + start, end - start,
9180 buf2, len2, PY_SSIZE_T_MAX
9181 );
9182 else
9183 result = ucs1lib_count(
9184 ((Py_UCS1*)buf1) + start, end - start,
9185 buf2, len2, PY_SSIZE_T_MAX
9186 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009187 break;
9188 case PyUnicode_2BYTE_KIND:
9189 result = ucs2lib_count(
9190 ((Py_UCS2*)buf1) + start, end - start,
9191 buf2, len2, PY_SSIZE_T_MAX
9192 );
9193 break;
9194 case PyUnicode_4BYTE_KIND:
9195 result = ucs4lib_count(
9196 ((Py_UCS4*)buf1) + start, end - start,
9197 buf2, len2, PY_SSIZE_T_MAX
9198 );
9199 break;
9200 default:
9201 assert(0); result = 0;
9202 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009203
9204 Py_DECREF(sub_obj);
9205 Py_DECREF(str_obj);
9206
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009207 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009208 PyMem_Free(buf2);
9209
Guido van Rossumd57fd912000-03-10 22:53:23 +00009210 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009211 onError:
9212 Py_DECREF(sub_obj);
9213 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009214 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009215 PyMem_Free(buf2);
9216 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009217}
9218
Alexander Belopolsky40018472011-02-26 01:02:56 +00009219Py_ssize_t
9220PyUnicode_Find(PyObject *str,
9221 PyObject *sub,
9222 Py_ssize_t start,
9223 Py_ssize_t end,
9224 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009226 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009227
Guido van Rossumd57fd912000-03-10 22:53:23 +00009228 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009229 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009230 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009231 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009232 if (!sub) {
9233 Py_DECREF(str);
9234 return -2;
9235 }
9236 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9237 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009238 Py_DECREF(str);
9239 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009240 }
Tim Petersced69f82003-09-16 20:30:58 +00009241
Victor Stinner794d5672011-10-10 03:21:36 +02009242 result = any_find_slice(direction,
9243 str, sub, start, end
9244 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009245
Guido van Rossumd57fd912000-03-10 22:53:23 +00009246 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009247 Py_DECREF(sub);
9248
Guido van Rossumd57fd912000-03-10 22:53:23 +00009249 return result;
9250}
9251
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009252Py_ssize_t
9253PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9254 Py_ssize_t start, Py_ssize_t end,
9255 int direction)
9256{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009257 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009258 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009259 if (PyUnicode_READY(str) == -1)
9260 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009261 if (start < 0 || end < 0) {
9262 PyErr_SetString(PyExc_IndexError, "string index out of range");
9263 return -2;
9264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265 if (end > PyUnicode_GET_LENGTH(str))
9266 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009267 if (start >= end)
9268 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009269 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009270 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9271 kind, end-start, ch, direction);
9272 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009274 else
9275 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009276}
9277
Alexander Belopolsky40018472011-02-26 01:02:56 +00009278static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009279tailmatch(PyObject *self,
9280 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009281 Py_ssize_t start,
9282 Py_ssize_t end,
9283 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285 int kind_self;
9286 int kind_sub;
9287 void *data_self;
9288 void *data_sub;
9289 Py_ssize_t offset;
9290 Py_ssize_t i;
9291 Py_ssize_t end_sub;
9292
9293 if (PyUnicode_READY(self) == -1 ||
9294 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009295 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009297 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9298 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009299 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009300 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009301
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009302 if (PyUnicode_GET_LENGTH(substring) == 0)
9303 return 1;
9304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305 kind_self = PyUnicode_KIND(self);
9306 data_self = PyUnicode_DATA(self);
9307 kind_sub = PyUnicode_KIND(substring);
9308 data_sub = PyUnicode_DATA(substring);
9309 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9310
9311 if (direction > 0)
9312 offset = end;
9313 else
9314 offset = start;
9315
9316 if (PyUnicode_READ(kind_self, data_self, offset) ==
9317 PyUnicode_READ(kind_sub, data_sub, 0) &&
9318 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9319 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9320 /* If both are of the same kind, memcmp is sufficient */
9321 if (kind_self == kind_sub) {
9322 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009323 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009324 data_sub,
9325 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009326 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009327 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009328 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329 else {
9330 /* We do not need to compare 0 and len(substring)-1 because
9331 the if statement above ensured already that they are equal
9332 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009333 for (i = 1; i < end_sub; ++i) {
9334 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9335 PyUnicode_READ(kind_sub, data_sub, i))
9336 return 0;
9337 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009338 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009339 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009340 }
9341
9342 return 0;
9343}
9344
Alexander Belopolsky40018472011-02-26 01:02:56 +00009345Py_ssize_t
9346PyUnicode_Tailmatch(PyObject *str,
9347 PyObject *substr,
9348 Py_ssize_t start,
9349 Py_ssize_t end,
9350 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009351{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009352 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009353
Guido van Rossumd57fd912000-03-10 22:53:23 +00009354 str = PyUnicode_FromObject(str);
9355 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009356 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009357 substr = PyUnicode_FromObject(substr);
9358 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009359 Py_DECREF(str);
9360 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361 }
Tim Petersced69f82003-09-16 20:30:58 +00009362
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009363 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009364 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365 Py_DECREF(str);
9366 Py_DECREF(substr);
9367 return result;
9368}
9369
Guido van Rossumd57fd912000-03-10 22:53:23 +00009370/* Apply fixfct filter to the Unicode object self and return a
9371 reference to the modified object */
9372
Alexander Belopolsky40018472011-02-26 01:02:56 +00009373static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009374fixup(PyObject *self,
9375 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009376{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377 PyObject *u;
9378 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009379 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009380
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009381 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009382 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009383 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009384 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009386 /* fix functions return the new maximum character in a string,
9387 if the kind of the resulting unicode object does not change,
9388 everything is fine. Otherwise we need to change the string kind
9389 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009390 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009391
9392 if (maxchar_new == 0) {
9393 /* no changes */;
9394 if (PyUnicode_CheckExact(self)) {
9395 Py_DECREF(u);
9396 Py_INCREF(self);
9397 return self;
9398 }
9399 else
9400 return u;
9401 }
9402
Victor Stinnere6abb482012-05-02 01:15:40 +02009403 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404
Victor Stinnereaab6042011-12-11 22:22:39 +01009405 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009407
9408 /* In case the maximum character changed, we need to
9409 convert the string to the new category. */
9410 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9411 if (v == NULL) {
9412 Py_DECREF(u);
9413 return NULL;
9414 }
9415 if (maxchar_new > maxchar_old) {
9416 /* If the maxchar increased so that the kind changed, not all
9417 characters are representable anymore and we need to fix the
9418 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009419 _PyUnicode_FastCopyCharacters(v, 0,
9420 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009421 maxchar_old = fixfct(v);
9422 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 }
9424 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009425 _PyUnicode_FastCopyCharacters(v, 0,
9426 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009428 Py_DECREF(u);
9429 assert(_PyUnicode_CheckConsistency(v, 1));
9430 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009431}
9432
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009433static PyObject *
9434ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009436 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9437 char *resdata, *data = PyUnicode_DATA(self);
9438 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009439
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009440 res = PyUnicode_New(len, 127);
9441 if (res == NULL)
9442 return NULL;
9443 resdata = PyUnicode_DATA(res);
9444 if (lower)
9445 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009446 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009447 _Py_bytes_upper(resdata, data, len);
9448 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009449}
9450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009452handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009453{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009454 Py_ssize_t j;
9455 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009456 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009457 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009458
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009459 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9460
9461 where ! is a negation and \p{xxx} is a character with property xxx.
9462 */
9463 for (j = i - 1; j >= 0; j--) {
9464 c = PyUnicode_READ(kind, data, j);
9465 if (!_PyUnicode_IsCaseIgnorable(c))
9466 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009467 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009468 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9469 if (final_sigma) {
9470 for (j = i + 1; j < length; j++) {
9471 c = PyUnicode_READ(kind, data, j);
9472 if (!_PyUnicode_IsCaseIgnorable(c))
9473 break;
9474 }
9475 final_sigma = j == length || !_PyUnicode_IsCased(c);
9476 }
9477 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478}
9479
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009480static int
9481lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9482 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009483{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009484 /* Obscure special case. */
9485 if (c == 0x3A3) {
9486 mapped[0] = handle_capital_sigma(kind, data, length, i);
9487 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009489 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009490}
9491
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009492static Py_ssize_t
9493do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009494{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009495 Py_ssize_t i, k = 0;
9496 int n_res, j;
9497 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009498
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009499 c = PyUnicode_READ(kind, data, 0);
9500 n_res = _PyUnicode_ToUpperFull(c, mapped);
9501 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009502 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009503 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009505 for (i = 1; i < length; i++) {
9506 c = PyUnicode_READ(kind, data, i);
9507 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9508 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009509 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009510 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009511 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009512 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009513 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514}
9515
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009516static Py_ssize_t
9517do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9518 Py_ssize_t i, k = 0;
9519
9520 for (i = 0; i < length; i++) {
9521 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9522 int n_res, j;
9523 if (Py_UNICODE_ISUPPER(c)) {
9524 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9525 }
9526 else if (Py_UNICODE_ISLOWER(c)) {
9527 n_res = _PyUnicode_ToUpperFull(c, mapped);
9528 }
9529 else {
9530 n_res = 1;
9531 mapped[0] = c;
9532 }
9533 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009534 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009535 res[k++] = mapped[j];
9536 }
9537 }
9538 return k;
9539}
9540
9541static Py_ssize_t
9542do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9543 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009545 Py_ssize_t i, k = 0;
9546
9547 for (i = 0; i < length; i++) {
9548 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9549 int n_res, j;
9550 if (lower)
9551 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9552 else
9553 n_res = _PyUnicode_ToUpperFull(c, mapped);
9554 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009555 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009556 res[k++] = mapped[j];
9557 }
9558 }
9559 return k;
9560}
9561
9562static Py_ssize_t
9563do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9564{
9565 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9566}
9567
9568static Py_ssize_t
9569do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9570{
9571 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9572}
9573
Benjamin Petersone51757f2012-01-12 21:10:29 -05009574static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009575do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9576{
9577 Py_ssize_t i, k = 0;
9578
9579 for (i = 0; i < length; i++) {
9580 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9581 Py_UCS4 mapped[3];
9582 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9583 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009584 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009585 res[k++] = mapped[j];
9586 }
9587 }
9588 return k;
9589}
9590
9591static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009592do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9593{
9594 Py_ssize_t i, k = 0;
9595 int previous_is_cased;
9596
9597 previous_is_cased = 0;
9598 for (i = 0; i < length; i++) {
9599 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9600 Py_UCS4 mapped[3];
9601 int n_res, j;
9602
9603 if (previous_is_cased)
9604 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9605 else
9606 n_res = _PyUnicode_ToTitleFull(c, mapped);
9607
9608 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009609 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009610 res[k++] = mapped[j];
9611 }
9612
9613 previous_is_cased = _PyUnicode_IsCased(c);
9614 }
9615 return k;
9616}
9617
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009618static PyObject *
9619case_operation(PyObject *self,
9620 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9621{
9622 PyObject *res = NULL;
9623 Py_ssize_t length, newlength = 0;
9624 int kind, outkind;
9625 void *data, *outdata;
9626 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9627
Benjamin Petersoneea48462012-01-16 14:28:50 -05009628 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009629
9630 kind = PyUnicode_KIND(self);
9631 data = PyUnicode_DATA(self);
9632 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009633 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009634 PyErr_SetString(PyExc_OverflowError, "string is too long");
9635 return NULL;
9636 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009637 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009638 if (tmp == NULL)
9639 return PyErr_NoMemory();
9640 newlength = perform(kind, data, length, tmp, &maxchar);
9641 res = PyUnicode_New(newlength, maxchar);
9642 if (res == NULL)
9643 goto leave;
9644 tmpend = tmp + newlength;
9645 outdata = PyUnicode_DATA(res);
9646 outkind = PyUnicode_KIND(res);
9647 switch (outkind) {
9648 case PyUnicode_1BYTE_KIND:
9649 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9650 break;
9651 case PyUnicode_2BYTE_KIND:
9652 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9653 break;
9654 case PyUnicode_4BYTE_KIND:
9655 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9656 break;
9657 default:
9658 assert(0);
9659 break;
9660 }
9661 leave:
9662 PyMem_FREE(tmp);
9663 return res;
9664}
9665
Tim Peters8ce9f162004-08-27 01:49:32 +00009666PyObject *
9667PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009668{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009669 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009670 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009671 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009672 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009673 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9674 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009675 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009676 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009677 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009679 int use_memcpy;
9680 unsigned char *res_data = NULL, *sep_data = NULL;
9681 PyObject *last_obj;
9682 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009684 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009685 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009686 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009687 }
9688
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009689 /* NOTE: the following code can't call back into Python code,
9690 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009691 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009692
Tim Peters05eba1f2004-08-27 21:32:02 +00009693 seqlen = PySequence_Fast_GET_SIZE(fseq);
9694 /* If empty sequence, return u"". */
9695 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009696 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009697 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009698 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009699
Tim Peters05eba1f2004-08-27 21:32:02 +00009700 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009701 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009702 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009703 if (seqlen == 1) {
9704 if (PyUnicode_CheckExact(items[0])) {
9705 res = items[0];
9706 Py_INCREF(res);
9707 Py_DECREF(fseq);
9708 return res;
9709 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009710 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009711 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009712 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009713 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009714 /* Set up sep and seplen */
9715 if (separator == NULL) {
9716 /* fall back to a blank space separator */
9717 sep = PyUnicode_FromOrdinal(' ');
9718 if (!sep)
9719 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009720 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009721 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009722 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009723 else {
9724 if (!PyUnicode_Check(separator)) {
9725 PyErr_Format(PyExc_TypeError,
9726 "separator: expected str instance,"
9727 " %.80s found",
9728 Py_TYPE(separator)->tp_name);
9729 goto onError;
9730 }
9731 if (PyUnicode_READY(separator))
9732 goto onError;
9733 sep = separator;
9734 seplen = PyUnicode_GET_LENGTH(separator);
9735 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9736 /* inc refcount to keep this code path symmetric with the
9737 above case of a blank separator */
9738 Py_INCREF(sep);
9739 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009740 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009741 }
9742
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009743 /* There are at least two things to join, or else we have a subclass
9744 * of str in the sequence.
9745 * Do a pre-pass to figure out the total amount of space we'll
9746 * need (sz), and see whether all argument are strings.
9747 */
9748 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009749#ifdef Py_DEBUG
9750 use_memcpy = 0;
9751#else
9752 use_memcpy = 1;
9753#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009754 for (i = 0; i < seqlen; i++) {
9755 const Py_ssize_t old_sz = sz;
9756 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009757 if (!PyUnicode_Check(item)) {
9758 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009759 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009760 " %.80s found",
9761 i, Py_TYPE(item)->tp_name);
9762 goto onError;
9763 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009764 if (PyUnicode_READY(item) == -1)
9765 goto onError;
9766 sz += PyUnicode_GET_LENGTH(item);
9767 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009768 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009769 if (i != 0)
9770 sz += seplen;
9771 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9772 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009773 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009774 goto onError;
9775 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009776 if (use_memcpy && last_obj != NULL) {
9777 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9778 use_memcpy = 0;
9779 }
9780 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009781 }
Tim Petersced69f82003-09-16 20:30:58 +00009782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009784 if (res == NULL)
9785 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009786
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009787 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009788#ifdef Py_DEBUG
9789 use_memcpy = 0;
9790#else
9791 if (use_memcpy) {
9792 res_data = PyUnicode_1BYTE_DATA(res);
9793 kind = PyUnicode_KIND(res);
9794 if (seplen != 0)
9795 sep_data = PyUnicode_1BYTE_DATA(sep);
9796 }
9797#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009798 if (use_memcpy) {
9799 for (i = 0; i < seqlen; ++i) {
9800 Py_ssize_t itemlen;
9801 item = items[i];
9802
9803 /* Copy item, and maybe the separator. */
9804 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009805 Py_MEMCPY(res_data,
9806 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009807 kind * seplen);
9808 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009809 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009810
9811 itemlen = PyUnicode_GET_LENGTH(item);
9812 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009813 Py_MEMCPY(res_data,
9814 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009815 kind * itemlen);
9816 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009817 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009818 }
9819 assert(res_data == PyUnicode_1BYTE_DATA(res)
9820 + kind * PyUnicode_GET_LENGTH(res));
9821 }
9822 else {
9823 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9824 Py_ssize_t itemlen;
9825 item = items[i];
9826
9827 /* Copy item, and maybe the separator. */
9828 if (i && seplen != 0) {
9829 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9830 res_offset += seplen;
9831 }
9832
9833 itemlen = PyUnicode_GET_LENGTH(item);
9834 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009835 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009836 res_offset += itemlen;
9837 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009838 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009839 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009840 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009841
Tim Peters05eba1f2004-08-27 21:32:02 +00009842 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009843 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009844 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009845 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009846
Benjamin Peterson29060642009-01-31 22:14:21 +00009847 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009848 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009849 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009850 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009851 return NULL;
9852}
9853
/* Store `value` into `length` consecutive characters of the buffer `data`,
   beginning at character index `start`.  `kind` selects the element width
   (1, 2 or 4 bytes per character); the legacy wchar kind is not accepted.

   This is a statement-like macro.  Every argument is parenthesized at each
   use so that compound expressions (e.g. `c ? a : b`) are evaluated as a
   whole.  Note that `kind` and `value` may still be evaluated more than
   once, so they must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9877
Victor Stinnerd3f08822012-05-29 12:57:52 +02009878void
9879_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9880 Py_UCS4 fill_char)
9881{
9882 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9883 const void *data = PyUnicode_DATA(unicode);
9884 assert(PyUnicode_IS_READY(unicode));
9885 assert(unicode_modifiable(unicode));
9886 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9887 assert(start >= 0);
9888 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9889 FILL(kind, data, fill_char, start, length);
9890}
9891
Victor Stinner3fe55312012-01-04 00:33:50 +01009892Py_ssize_t
9893PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9894 Py_UCS4 fill_char)
9895{
9896 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009897
9898 if (!PyUnicode_Check(unicode)) {
9899 PyErr_BadInternalCall();
9900 return -1;
9901 }
9902 if (PyUnicode_READY(unicode) == -1)
9903 return -1;
9904 if (unicode_check_modifiable(unicode))
9905 return -1;
9906
Victor Stinnerd3f08822012-05-29 12:57:52 +02009907 if (start < 0) {
9908 PyErr_SetString(PyExc_IndexError, "string index out of range");
9909 return -1;
9910 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009911 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9912 PyErr_SetString(PyExc_ValueError,
9913 "fill character is bigger than "
9914 "the string maximum character");
9915 return -1;
9916 }
9917
9918 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9919 length = Py_MIN(maxlen, length);
9920 if (length <= 0)
9921 return 0;
9922
Victor Stinnerd3f08822012-05-29 12:57:52 +02009923 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009924 return length;
9925}
9926
Victor Stinner9310abb2011-10-05 00:59:23 +02009927static PyObject *
9928pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009929 Py_ssize_t left,
9930 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009931 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009933 PyObject *u;
9934 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009935 int kind;
9936 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009937
9938 if (left < 0)
9939 left = 0;
9940 if (right < 0)
9941 right = 0;
9942
Victor Stinnerc4b49542011-12-11 22:44:26 +01009943 if (left == 0 && right == 0)
9944 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009945
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009946 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9947 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009948 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9949 return NULL;
9950 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009951 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009952 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009953 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009954 if (!u)
9955 return NULL;
9956
9957 kind = PyUnicode_KIND(u);
9958 data = PyUnicode_DATA(u);
9959 if (left)
9960 FILL(kind, data, fill, 0, left);
9961 if (right)
9962 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009963 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009964 assert(_PyUnicode_CheckConsistency(u, 1));
9965 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009966}
9967
Alexander Belopolsky40018472011-02-26 01:02:56 +00009968PyObject *
9969PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009970{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009971 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009972
9973 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009974 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009975 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009976 if (PyUnicode_READY(string) == -1) {
9977 Py_DECREF(string);
9978 return NULL;
9979 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009980
Benjamin Petersonead6b532011-12-20 17:23:42 -06009981 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009982 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009983 if (PyUnicode_IS_ASCII(string))
9984 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009985 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009986 PyUnicode_GET_LENGTH(string), keepends);
9987 else
9988 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009989 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009990 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009991 break;
9992 case PyUnicode_2BYTE_KIND:
9993 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009994 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995 PyUnicode_GET_LENGTH(string), keepends);
9996 break;
9997 case PyUnicode_4BYTE_KIND:
9998 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009999 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000 PyUnicode_GET_LENGTH(string), keepends);
10001 break;
10002 default:
10003 assert(0);
10004 list = 0;
10005 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010006 Py_DECREF(string);
10007 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010008}
10009
Alexander Belopolsky40018472011-02-26 01:02:56 +000010010static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010011split(PyObject *self,
10012 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010013 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010014{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010015 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010016 void *buf1, *buf2;
10017 Py_ssize_t len1, len2;
10018 PyObject* out;
10019
Guido van Rossumd57fd912000-03-10 22:53:23 +000010020 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010021 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023 if (PyUnicode_READY(self) == -1)
10024 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010026 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010027 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010028 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010029 if (PyUnicode_IS_ASCII(self))
10030 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010031 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010032 PyUnicode_GET_LENGTH(self), maxcount
10033 );
10034 else
10035 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010036 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010037 PyUnicode_GET_LENGTH(self), maxcount
10038 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010039 case PyUnicode_2BYTE_KIND:
10040 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010041 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042 PyUnicode_GET_LENGTH(self), maxcount
10043 );
10044 case PyUnicode_4BYTE_KIND:
10045 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010046 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010047 PyUnicode_GET_LENGTH(self), maxcount
10048 );
10049 default:
10050 assert(0);
10051 return NULL;
10052 }
10053
10054 if (PyUnicode_READY(substring) == -1)
10055 return NULL;
10056
10057 kind1 = PyUnicode_KIND(self);
10058 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 len1 = PyUnicode_GET_LENGTH(self);
10060 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010061 if (kind1 < kind2 || len1 < len2) {
10062 out = PyList_New(1);
10063 if (out == NULL)
10064 return NULL;
10065 Py_INCREF(self);
10066 PyList_SET_ITEM(out, 0, self);
10067 return out;
10068 }
10069 buf1 = PyUnicode_DATA(self);
10070 buf2 = PyUnicode_DATA(substring);
10071 if (kind2 != kind1) {
10072 buf2 = _PyUnicode_AsKind(substring, kind1);
10073 if (!buf2)
10074 return NULL;
10075 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010076
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010077 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010078 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010079 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10080 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010081 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010082 else
10083 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010084 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010085 break;
10086 case PyUnicode_2BYTE_KIND:
10087 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010088 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 break;
10090 case PyUnicode_4BYTE_KIND:
10091 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010092 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 break;
10094 default:
10095 out = NULL;
10096 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010097 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010098 PyMem_Free(buf2);
10099 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010100}
10101
Alexander Belopolsky40018472011-02-26 01:02:56 +000010102static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010103rsplit(PyObject *self,
10104 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010105 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010106{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010107 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 void *buf1, *buf2;
10109 Py_ssize_t len1, len2;
10110 PyObject* out;
10111
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010112 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010113 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 if (PyUnicode_READY(self) == -1)
10116 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010118 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010119 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010121 if (PyUnicode_IS_ASCII(self))
10122 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010123 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010124 PyUnicode_GET_LENGTH(self), maxcount
10125 );
10126 else
10127 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010128 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010129 PyUnicode_GET_LENGTH(self), maxcount
10130 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 case PyUnicode_2BYTE_KIND:
10132 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010133 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010134 PyUnicode_GET_LENGTH(self), maxcount
10135 );
10136 case PyUnicode_4BYTE_KIND:
10137 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010138 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010139 PyUnicode_GET_LENGTH(self), maxcount
10140 );
10141 default:
10142 assert(0);
10143 return NULL;
10144 }
10145
10146 if (PyUnicode_READY(substring) == -1)
10147 return NULL;
10148
10149 kind1 = PyUnicode_KIND(self);
10150 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 len1 = PyUnicode_GET_LENGTH(self);
10152 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010153 if (kind1 < kind2 || len1 < len2) {
10154 out = PyList_New(1);
10155 if (out == NULL)
10156 return NULL;
10157 Py_INCREF(self);
10158 PyList_SET_ITEM(out, 0, self);
10159 return out;
10160 }
10161 buf1 = PyUnicode_DATA(self);
10162 buf2 = PyUnicode_DATA(substring);
10163 if (kind2 != kind1) {
10164 buf2 = _PyUnicode_AsKind(substring, kind1);
10165 if (!buf2)
10166 return NULL;
10167 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010169 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010171 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10172 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010173 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010174 else
10175 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010176 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 break;
10178 case PyUnicode_2BYTE_KIND:
10179 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010180 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 break;
10182 case PyUnicode_4BYTE_KIND:
10183 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010184 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 break;
10186 default:
10187 out = NULL;
10188 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010189 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 PyMem_Free(buf2);
10191 return out;
10192}
10193
10194static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010195anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10196 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010198 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010199 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010200 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10201 return asciilib_find(buf1, len1, buf2, len2, offset);
10202 else
10203 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010204 case PyUnicode_2BYTE_KIND:
10205 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10206 case PyUnicode_4BYTE_KIND:
10207 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10208 }
10209 assert(0);
10210 return -1;
10211}
10212
10213static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010214anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10215 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010216{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010217 switch (kind) {
10218 case PyUnicode_1BYTE_KIND:
10219 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10220 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10221 else
10222 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10223 case PyUnicode_2BYTE_KIND:
10224 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10225 case PyUnicode_4BYTE_KIND:
10226 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10227 }
10228 assert(0);
10229 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010230}
10231
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010232static void
10233replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10234 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10235{
10236 int kind = PyUnicode_KIND(u);
10237 void *data = PyUnicode_DATA(u);
10238 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10239 if (kind == PyUnicode_1BYTE_KIND) {
10240 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10241 (Py_UCS1 *)data + len,
10242 u1, u2, maxcount);
10243 }
10244 else if (kind == PyUnicode_2BYTE_KIND) {
10245 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10246 (Py_UCS2 *)data + len,
10247 u1, u2, maxcount);
10248 }
10249 else {
10250 assert(kind == PyUnicode_4BYTE_KIND);
10251 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10252 (Py_UCS4 *)data + len,
10253 u1, u2, maxcount);
10254 }
10255}
10256
Alexander Belopolsky40018472011-02-26 01:02:56 +000010257static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258replace(PyObject *self, PyObject *str1,
10259 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010260{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010261 PyObject *u;
10262 char *sbuf = PyUnicode_DATA(self);
10263 char *buf1 = PyUnicode_DATA(str1);
10264 char *buf2 = PyUnicode_DATA(str2);
10265 int srelease = 0, release1 = 0, release2 = 0;
10266 int skind = PyUnicode_KIND(self);
10267 int kind1 = PyUnicode_KIND(str1);
10268 int kind2 = PyUnicode_KIND(str2);
10269 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10270 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10271 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010272 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010273 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010274
10275 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010276 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010278 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010279
Victor Stinner59de0ee2011-10-07 10:01:28 +020010280 if (str1 == str2)
10281 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282
Victor Stinner49a0a212011-10-12 23:46:10 +020010283 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010284 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10285 if (maxchar < maxchar_str1)
10286 /* substring too wide to be present */
10287 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010288 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10289 /* Replacing str1 with str2 may cause a maxchar reduction in the
10290 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010291 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010292 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010295 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010297 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010299 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010300 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010301 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010302
Victor Stinner69ed0f42013-04-09 21:48:24 +020010303 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010304 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010305 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010306 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010307 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010309 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010311
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010312 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10313 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010314 }
10315 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 int rkind = skind;
10317 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010318 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 if (kind1 < rkind) {
10321 /* widen substring */
10322 buf1 = _PyUnicode_AsKind(str1, rkind);
10323 if (!buf1) goto error;
10324 release1 = 1;
10325 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010326 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010327 if (i < 0)
10328 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 if (rkind > kind2) {
10330 /* widen replacement */
10331 buf2 = _PyUnicode_AsKind(str2, rkind);
10332 if (!buf2) goto error;
10333 release2 = 1;
10334 }
10335 else if (rkind < kind2) {
10336 /* widen self and buf1 */
10337 rkind = kind2;
10338 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010339 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 sbuf = _PyUnicode_AsKind(self, rkind);
10341 if (!sbuf) goto error;
10342 srelease = 1;
10343 buf1 = _PyUnicode_AsKind(str1, rkind);
10344 if (!buf1) goto error;
10345 release1 = 1;
10346 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010347 u = PyUnicode_New(slen, maxchar);
10348 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010350 assert(PyUnicode_KIND(u) == rkind);
10351 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010352
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010353 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010354 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010355 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010357 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010359
10360 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010361 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010362 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010363 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010364 if (i == -1)
10365 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010366 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010368 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010369 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010370 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010372 }
10373 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010375 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 int rkind = skind;
10377 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010378
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010380 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 buf1 = _PyUnicode_AsKind(str1, rkind);
10382 if (!buf1) goto error;
10383 release1 = 1;
10384 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010385 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010386 if (n == 0)
10387 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010388 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010389 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 buf2 = _PyUnicode_AsKind(str2, rkind);
10391 if (!buf2) goto error;
10392 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010395 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 rkind = kind2;
10397 sbuf = _PyUnicode_AsKind(self, rkind);
10398 if (!sbuf) goto error;
10399 srelease = 1;
10400 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010401 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 buf1 = _PyUnicode_AsKind(str1, rkind);
10403 if (!buf1) goto error;
10404 release1 = 1;
10405 }
10406 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10407 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010408 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409 PyErr_SetString(PyExc_OverflowError,
10410 "replace string is too long");
10411 goto error;
10412 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010413 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010414 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010415 _Py_INCREF_UNICODE_EMPTY();
10416 if (!unicode_empty)
10417 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010418 u = unicode_empty;
10419 goto done;
10420 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010421 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010422 PyErr_SetString(PyExc_OverflowError,
10423 "replace string is too long");
10424 goto error;
10425 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010426 u = PyUnicode_New(new_size, maxchar);
10427 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010429 assert(PyUnicode_KIND(u) == rkind);
10430 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 ires = i = 0;
10432 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010433 while (n-- > 0) {
10434 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010435 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010436 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010437 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010438 if (j == -1)
10439 break;
10440 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010441 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010442 memcpy(res + rkind * ires,
10443 sbuf + rkind * i,
10444 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010446 }
10447 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010449 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010451 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010455 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010457 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010458 memcpy(res + rkind * ires,
10459 sbuf + rkind * i,
10460 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010461 }
10462 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010463 /* interleave */
10464 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010465 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010467 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010468 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010469 if (--n <= 0)
10470 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010471 memcpy(res + rkind * ires,
10472 sbuf + rkind * i,
10473 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 ires++;
10475 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010476 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010477 memcpy(res + rkind * ires,
10478 sbuf + rkind * i,
10479 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010480 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010481 }
10482
10483 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010484 unicode_adjust_maxchar(&u);
10485 if (u == NULL)
10486 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010488
10489 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 if (srelease)
10491 PyMem_FREE(sbuf);
10492 if (release1)
10493 PyMem_FREE(buf1);
10494 if (release2)
10495 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010496 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010498
Benjamin Peterson29060642009-01-31 22:14:21 +000010499 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010500 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 if (srelease)
10502 PyMem_FREE(sbuf);
10503 if (release1)
10504 PyMem_FREE(buf1);
10505 if (release2)
10506 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010507 return unicode_result_unchanged(self);
10508
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 error:
10510 if (srelease && sbuf)
10511 PyMem_FREE(sbuf);
10512 if (release1 && buf1)
10513 PyMem_FREE(buf1);
10514 if (release2 && buf2)
10515 PyMem_FREE(buf2);
10516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010517}
10518
10519/* --- Unicode Object Methods --------------------------------------------- */
10520
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010521PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010522 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010523\n\
10524Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010525characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526
10527static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010528unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010529{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010530 if (PyUnicode_READY(self) == -1)
10531 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010532 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533}
10534
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010535PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010536 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010537\n\
10538Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010539have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010540
10541static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010542unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010543{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010544 if (PyUnicode_READY(self) == -1)
10545 return NULL;
10546 if (PyUnicode_GET_LENGTH(self) == 0)
10547 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010548 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010549}
10550
Benjamin Petersond5890c82012-01-14 13:23:30 -050010551PyDoc_STRVAR(casefold__doc__,
10552 "S.casefold() -> str\n\
10553\n\
10554Return a version of S suitable for caseless comparisons.");
10555
10556static PyObject *
10557unicode_casefold(PyObject *self)
10558{
10559 if (PyUnicode_READY(self) == -1)
10560 return NULL;
10561 if (PyUnicode_IS_ASCII(self))
10562 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010563 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010564}
10565
10566
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010567/* Argument converter. Coerces to a single unicode character */
10568
10569static int
10570convert_uc(PyObject *obj, void *addr)
10571{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010573 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010574
Benjamin Peterson14339b62009-01-31 16:36:08 +000010575 uniobj = PyUnicode_FromObject(obj);
10576 if (uniobj == NULL) {
10577 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010578 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010579 return 0;
10580 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010582 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010583 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010584 Py_DECREF(uniobj);
10585 return 0;
10586 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010588 Py_DECREF(uniobj);
10589 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010590}
10591
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010592PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010593 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010594\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010595Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010596done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597
10598static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010599unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010601 Py_ssize_t marg, left;
10602 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 Py_UCS4 fillchar = ' ';
10604
Victor Stinnere9a29352011-10-01 02:14:59 +020010605 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607
Benjamin Petersonbac79492012-01-14 13:34:47 -050010608 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010609 return NULL;
10610
Victor Stinnerc4b49542011-12-11 22:44:26 +010010611 if (PyUnicode_GET_LENGTH(self) >= width)
10612 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010613
Victor Stinnerc4b49542011-12-11 22:44:26 +010010614 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615 left = marg / 2 + (marg & width & 1);
10616
Victor Stinner9310abb2011-10-05 00:59:23 +020010617 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618}
10619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620/* This function assumes that str1 and str2 are readied by the caller. */
10621
Marc-André Lemburge5034372000-08-08 08:04:29 +000010622static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010623unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010624{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010625#define COMPARE(TYPE1, TYPE2) \
10626 do { \
10627 TYPE1* p1 = (TYPE1 *)data1; \
10628 TYPE2* p2 = (TYPE2 *)data2; \
10629 TYPE1* end = p1 + len; \
10630 Py_UCS4 c1, c2; \
10631 for (; p1 != end; p1++, p2++) { \
10632 c1 = *p1; \
10633 c2 = *p2; \
10634 if (c1 != c2) \
10635 return (c1 < c2) ? -1 : 1; \
10636 } \
10637 } \
10638 while (0)
10639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 int kind1, kind2;
10641 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010642 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 kind1 = PyUnicode_KIND(str1);
10645 kind2 = PyUnicode_KIND(str2);
10646 data1 = PyUnicode_DATA(str1);
10647 data2 = PyUnicode_DATA(str2);
10648 len1 = PyUnicode_GET_LENGTH(str1);
10649 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010650 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010651
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010652 switch(kind1) {
10653 case PyUnicode_1BYTE_KIND:
10654 {
10655 switch(kind2) {
10656 case PyUnicode_1BYTE_KIND:
10657 {
10658 int cmp = memcmp(data1, data2, len);
10659 /* normalize result of memcmp() into the range [-1; 1] */
10660 if (cmp < 0)
10661 return -1;
10662 if (cmp > 0)
10663 return 1;
10664 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010665 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010666 case PyUnicode_2BYTE_KIND:
10667 COMPARE(Py_UCS1, Py_UCS2);
10668 break;
10669 case PyUnicode_4BYTE_KIND:
10670 COMPARE(Py_UCS1, Py_UCS4);
10671 break;
10672 default:
10673 assert(0);
10674 }
10675 break;
10676 }
10677 case PyUnicode_2BYTE_KIND:
10678 {
10679 switch(kind2) {
10680 case PyUnicode_1BYTE_KIND:
10681 COMPARE(Py_UCS2, Py_UCS1);
10682 break;
10683 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010684 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010685 COMPARE(Py_UCS2, Py_UCS2);
10686 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010687 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010688 case PyUnicode_4BYTE_KIND:
10689 COMPARE(Py_UCS2, Py_UCS4);
10690 break;
10691 default:
10692 assert(0);
10693 }
10694 break;
10695 }
10696 case PyUnicode_4BYTE_KIND:
10697 {
10698 switch(kind2) {
10699 case PyUnicode_1BYTE_KIND:
10700 COMPARE(Py_UCS4, Py_UCS1);
10701 break;
10702 case PyUnicode_2BYTE_KIND:
10703 COMPARE(Py_UCS4, Py_UCS2);
10704 break;
10705 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010706 {
10707#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10708 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10709 /* normalize result of wmemcmp() into the range [-1; 1] */
10710 if (cmp < 0)
10711 return -1;
10712 if (cmp > 0)
10713 return 1;
10714#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010715 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010716#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010717 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010718 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010719 default:
10720 assert(0);
10721 }
10722 break;
10723 }
10724 default:
10725 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010726 }
10727
Victor Stinner770e19e2012-10-04 22:59:45 +020010728 if (len1 == len2)
10729 return 0;
10730 if (len1 < len2)
10731 return -1;
10732 else
10733 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010734
10735#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010736}
10737
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010738Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010739unicode_compare_eq(PyObject *str1, PyObject *str2)
10740{
10741 int kind;
10742 void *data1, *data2;
10743 Py_ssize_t len;
10744 int cmp;
10745
Victor Stinnere5567ad2012-10-23 02:48:49 +020010746 len = PyUnicode_GET_LENGTH(str1);
10747 if (PyUnicode_GET_LENGTH(str2) != len)
10748 return 0;
10749 kind = PyUnicode_KIND(str1);
10750 if (PyUnicode_KIND(str2) != kind)
10751 return 0;
10752 data1 = PyUnicode_DATA(str1);
10753 data2 = PyUnicode_DATA(str2);
10754
10755 cmp = memcmp(data1, data2, len * kind);
10756 return (cmp == 0);
10757}
10758
10759
Alexander Belopolsky40018472011-02-26 01:02:56 +000010760int
10761PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010763 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10764 if (PyUnicode_READY(left) == -1 ||
10765 PyUnicode_READY(right) == -1)
10766 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010767
10768 /* a string is equal to itself */
10769 if (left == right)
10770 return 0;
10771
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010772 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010773 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010774 PyErr_Format(PyExc_TypeError,
10775 "Can't compare %.100s and %.100s",
10776 left->ob_type->tp_name,
10777 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010778 return -1;
10779}
10780
Martin v. Löwis5b222132007-06-10 09:51:05 +000010781int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010782_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10783{
10784 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10785 if (right_str == NULL)
10786 return -1;
10787 return PyUnicode_Compare(left, right_str);
10788}
10789
10790int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010791PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10792{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 Py_ssize_t i;
10794 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010795 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010796 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797
Victor Stinner910337b2011-10-03 03:20:16 +020010798 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020010799 if (!PyUnicode_IS_READY(uni)) {
10800 const wchar_t *ws = _PyUnicode_WSTR(uni);
10801 /* Compare Unicode string and source character set string */
10802 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
10803 if (chr != ustr[i])
10804 return (chr < ustr[i]) ? -1 : 1;
10805 }
10806 /* This check keeps Python strings that end in '\0' from comparing equal
10807 to C strings identical up to that point. */
10808 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
10809 return 1; /* uni is longer */
10810 if (ustr[i])
10811 return -1; /* str is longer */
10812 return 0;
10813 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010815 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010816 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010817 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010818 size_t len, len2 = strlen(str);
10819 int cmp;
10820
10821 len = Py_MIN(len1, len2);
10822 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010823 if (cmp != 0) {
10824 if (cmp < 0)
10825 return -1;
10826 else
10827 return 1;
10828 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010829 if (len1 > len2)
10830 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010831 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010832 return -1; /* str is longer */
10833 return 0;
10834 }
10835 else {
10836 void *data = PyUnicode_DATA(uni);
10837 /* Compare Unicode string and source character set string */
10838 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010839 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010840 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10841 /* This check keeps Python strings that end in '\0' from comparing equal
10842 to C strings identical up to that point. */
10843 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10844 return 1; /* uni is longer */
10845 if (str[i])
10846 return -1; /* str is longer */
10847 return 0;
10848 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010849}
10850
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020010851static int
10852non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
10853{
10854 size_t i, len;
10855 const wchar_t *p;
10856 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
10857 if (strlen(str) != len)
10858 return 0;
10859 p = _PyUnicode_WSTR(unicode);
10860 assert(p);
10861 for (i = 0; i < len; i++) {
10862 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020010863 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020010864 return 0;
10865 }
10866 return 1;
10867}
10868
10869int
10870_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
10871{
10872 size_t len;
10873 assert(_PyUnicode_CHECK(unicode));
10874 if (PyUnicode_READY(unicode) == -1) {
10875 /* Memory error or bad data */
10876 PyErr_Clear();
10877 return non_ready_unicode_equal_to_ascii_string(unicode, str);
10878 }
10879 if (!PyUnicode_IS_ASCII(unicode))
10880 return 0;
10881 len = (size_t)PyUnicode_GET_LENGTH(unicode);
10882 return strlen(str) == len &&
10883 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
10884}
10885
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020010886int
10887_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
10888{
10889 PyObject *right_uni;
10890 Py_hash_t hash;
10891
10892 assert(_PyUnicode_CHECK(left));
10893 assert(right->string);
10894
10895 if (PyUnicode_READY(left) == -1) {
10896 /* memory error or bad data */
10897 PyErr_Clear();
10898 return non_ready_unicode_equal_to_ascii_string(left, right->string);
10899 }
10900
10901 if (!PyUnicode_IS_ASCII(left))
10902 return 0;
10903
10904 right_uni = _PyUnicode_FromId(right); /* borrowed */
10905 if (right_uni == NULL) {
10906 /* memory error or bad data */
10907 PyErr_Clear();
10908 return _PyUnicode_EqualToASCIIString(left, right->string);
10909 }
10910
10911 if (left == right_uni)
10912 return 1;
10913
10914 if (PyUnicode_CHECK_INTERNED(left))
10915 return 0;
10916
10917 assert(_PyUnicode_HASH(right_uni) != 1);
10918 hash = _PyUnicode_HASH(left);
10919 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
10920 return 0;
10921
10922 return unicode_compare_eq(left, right_uni);
10923}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010924
Benjamin Peterson29060642009-01-31 22:14:21 +000010925#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010926 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010927
Alexander Belopolsky40018472011-02-26 01:02:56 +000010928PyObject *
10929PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010930{
10931 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010932 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010933
Victor Stinnere5567ad2012-10-23 02:48:49 +020010934 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10935 Py_RETURN_NOTIMPLEMENTED;
10936
10937 if (PyUnicode_READY(left) == -1 ||
10938 PyUnicode_READY(right) == -1)
10939 return NULL;
10940
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010941 if (left == right) {
10942 switch (op) {
10943 case Py_EQ:
10944 case Py_LE:
10945 case Py_GE:
10946 /* a string is equal to itself */
10947 v = Py_True;
10948 break;
10949 case Py_NE:
10950 case Py_LT:
10951 case Py_GT:
10952 v = Py_False;
10953 break;
10954 default:
10955 PyErr_BadArgument();
10956 return NULL;
10957 }
10958 }
10959 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010960 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010961 result ^= (op == Py_NE);
10962 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010963 }
10964 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010965 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010966
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010967 /* Convert the return value to a Boolean */
10968 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010969 case Py_LE:
10970 v = TEST_COND(result <= 0);
10971 break;
10972 case Py_GE:
10973 v = TEST_COND(result >= 0);
10974 break;
10975 case Py_LT:
10976 v = TEST_COND(result == -1);
10977 break;
10978 case Py_GT:
10979 v = TEST_COND(result == 1);
10980 break;
10981 default:
10982 PyErr_BadArgument();
10983 return NULL;
10984 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010985 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010986 Py_INCREF(v);
10987 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010988}
10989
Alexander Belopolsky40018472011-02-26 01:02:56 +000010990int
10991PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010992{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010993 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010994 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010995 void *buf1, *buf2;
10996 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010997 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010998
10999 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000011000 sub = PyUnicode_FromObject(element);
11001 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011002 PyErr_Format(PyExc_TypeError,
11003 "'in <string>' requires string as left operand, not %s",
11004 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011005 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011006 }
11007
Thomas Wouters477c8d52006-05-27 19:21:47 +000011008 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011009 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011010 Py_DECREF(sub);
11011 return -1;
11012 }
11013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014 kind1 = PyUnicode_KIND(str);
11015 kind2 = PyUnicode_KIND(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011016 if (kind1 < kind2) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050011018 Py_DECREF(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011019 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011020 }
11021 len1 = PyUnicode_GET_LENGTH(str);
11022 len2 = PyUnicode_GET_LENGTH(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011023 if (len1 < len2) {
11024 Py_DECREF(sub);
11025 Py_DECREF(str);
11026 return 0;
11027 }
11028 buf1 = PyUnicode_DATA(str);
11029 buf2 = PyUnicode_DATA(sub);
11030 if (len2 == 1) {
11031 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11032 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11033 Py_DECREF(sub);
11034 Py_DECREF(str);
11035 return result;
11036 }
11037 if (kind2 != kind1) {
11038 buf2 = _PyUnicode_AsKind(sub, kind1);
11039 if (!buf2) {
11040 Py_DECREF(sub);
11041 Py_DECREF(str);
11042 return -1;
11043 }
11044 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011045
Victor Stinner77282cb2013-04-14 19:22:47 +020011046 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047 case PyUnicode_1BYTE_KIND:
11048 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11049 break;
11050 case PyUnicode_2BYTE_KIND:
11051 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11052 break;
11053 case PyUnicode_4BYTE_KIND:
11054 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11055 break;
11056 default:
11057 result = -1;
11058 assert(0);
11059 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011060
11061 Py_DECREF(str);
11062 Py_DECREF(sub);
11063
Victor Stinner77282cb2013-04-14 19:22:47 +020011064 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011065 PyMem_Free(buf2);
11066
Guido van Rossum403d68b2000-03-13 15:55:09 +000011067 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011068}
11069
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070/* Concat to string or Unicode object giving a new Unicode object. */
11071
Alexander Belopolsky40018472011-02-26 01:02:56 +000011072PyObject *
11073PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011074{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011075 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020011076 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010011077 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078
11079 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011080 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011081 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011082 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011083 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011085 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011086
11087 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011088 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011089 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011090 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011091 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011092 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011093 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011094 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095 }
11096
Victor Stinner488fa492011-12-12 00:01:39 +010011097 u_len = PyUnicode_GET_LENGTH(u);
11098 v_len = PyUnicode_GET_LENGTH(v);
11099 if (u_len > PY_SSIZE_T_MAX - v_len) {
11100 PyErr_SetString(PyExc_OverflowError,
11101 "strings are too large to concat");
11102 goto onError;
11103 }
11104 new_len = u_len + v_len;
11105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011106 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011107 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011108 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011109
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011111 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011112 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011113 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011114 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11115 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116 Py_DECREF(u);
11117 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011118 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011119 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011120
Benjamin Peterson29060642009-01-31 22:14:21 +000011121 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122 Py_XDECREF(u);
11123 Py_XDECREF(v);
11124 return NULL;
11125}
11126
Walter Dörwald1ab83302007-05-18 17:15:44 +000011127void
Victor Stinner23e56682011-10-03 03:54:37 +020011128PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011129{
Victor Stinner23e56682011-10-03 03:54:37 +020011130 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011131 Py_UCS4 maxchar, maxchar2;
11132 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011133
11134 if (p_left == NULL) {
11135 if (!PyErr_Occurred())
11136 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011137 return;
11138 }
Victor Stinner23e56682011-10-03 03:54:37 +020011139 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011140 if (right == NULL || left == NULL
11141 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011142 if (!PyErr_Occurred())
11143 PyErr_BadInternalCall();
11144 goto error;
11145 }
11146
Benjamin Petersonbac79492012-01-14 13:34:47 -050011147 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011148 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011149 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011150 goto error;
11151
Victor Stinner488fa492011-12-12 00:01:39 +010011152 /* Shortcuts */
11153 if (left == unicode_empty) {
11154 Py_DECREF(left);
11155 Py_INCREF(right);
11156 *p_left = right;
11157 return;
11158 }
11159 if (right == unicode_empty)
11160 return;
11161
11162 left_len = PyUnicode_GET_LENGTH(left);
11163 right_len = PyUnicode_GET_LENGTH(right);
11164 if (left_len > PY_SSIZE_T_MAX - right_len) {
11165 PyErr_SetString(PyExc_OverflowError,
11166 "strings are too large to concat");
11167 goto error;
11168 }
11169 new_len = left_len + right_len;
11170
11171 if (unicode_modifiable(left)
11172 && PyUnicode_CheckExact(right)
11173 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011174 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11175 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011176 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011177 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011178 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11179 {
11180 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011181 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011182 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011183
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011184 /* copy 'right' into the newly allocated area of 'left' */
11185 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011186 }
Victor Stinner488fa492011-12-12 00:01:39 +010011187 else {
11188 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11189 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011190 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011191
Victor Stinner488fa492011-12-12 00:01:39 +010011192 /* Concat the two Unicode strings */
11193 res = PyUnicode_New(new_len, maxchar);
11194 if (res == NULL)
11195 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011196 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11197 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011198 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011199 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011200 }
11201 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011202 return;
11203
11204error:
Victor Stinner488fa492011-12-12 00:01:39 +010011205 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011206}
11207
11208void
11209PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11210{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011211 PyUnicode_Append(pleft, right);
11212 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011213}
11214
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011215PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011216 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011218Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011219string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011220interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221
11222static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011223unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011224{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011225 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011226 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011227 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011229 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011230 void *buf1, *buf2;
11231 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232
Jesus Ceaac451502011-04-20 17:09:23 +020011233 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11234 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011235 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237 kind1 = PyUnicode_KIND(self);
11238 kind2 = PyUnicode_KIND(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011239 if (kind1 < kind2) {
Christian Heimesd47802e2013-06-29 21:33:36 +020011240 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011241 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011242 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011243 len1 = PyUnicode_GET_LENGTH(self);
11244 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011245 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011246 if (end - start < len2) {
11247 Py_DECREF(substring);
11248 return PyLong_FromLong(0);
11249 }
11250 buf1 = PyUnicode_DATA(self);
11251 buf2 = PyUnicode_DATA(substring);
11252 if (kind2 != kind1) {
11253 buf2 = _PyUnicode_AsKind(substring, kind1);
11254 if (!buf2) {
11255 Py_DECREF(substring);
11256 return NULL;
11257 }
11258 }
11259 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011260 case PyUnicode_1BYTE_KIND:
11261 iresult = ucs1lib_count(
11262 ((Py_UCS1*)buf1) + start, end - start,
11263 buf2, len2, PY_SSIZE_T_MAX
11264 );
11265 break;
11266 case PyUnicode_2BYTE_KIND:
11267 iresult = ucs2lib_count(
11268 ((Py_UCS2*)buf1) + start, end - start,
11269 buf2, len2, PY_SSIZE_T_MAX
11270 );
11271 break;
11272 case PyUnicode_4BYTE_KIND:
11273 iresult = ucs4lib_count(
11274 ((Py_UCS4*)buf1) + start, end - start,
11275 buf2, len2, PY_SSIZE_T_MAX
11276 );
11277 break;
11278 default:
11279 assert(0); iresult = 0;
11280 }
11281
11282 result = PyLong_FromSsize_t(iresult);
11283
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011284 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011285 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286
11287 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011288
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289 return result;
11290}
11291
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011292PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011293 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011295Encode S using the codec registered for encoding. Default encoding\n\
11296is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011297handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011298a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11299'xmlcharrefreplace' as well as any other name registered with\n\
11300codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301
11302static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011303unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011304{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011305 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306 char *encoding = NULL;
11307 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011308
Benjamin Peterson308d6372009-09-18 21:42:35 +000011309 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11310 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011312 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011313}
11314
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011315PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011316 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317\n\
11318Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011319If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320
11321static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011322unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011324 Py_ssize_t i, j, line_pos, src_len, incr;
11325 Py_UCS4 ch;
11326 PyObject *u;
11327 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011328 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011330 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011331 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332
Ezio Melotti745d54d2013-11-16 19:10:57 +020011333 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11334 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011335 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336
Antoine Pitrou22425222011-10-04 19:10:51 +020011337 if (PyUnicode_READY(self) == -1)
11338 return NULL;
11339
Thomas Wouters7e474022000-07-16 12:04:32 +000011340 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011341 src_len = PyUnicode_GET_LENGTH(self);
11342 i = j = line_pos = 0;
11343 kind = PyUnicode_KIND(self);
11344 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011345 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011346 for (; i < src_len; i++) {
11347 ch = PyUnicode_READ(kind, src_data, i);
11348 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011349 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011350 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011351 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011352 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011353 goto overflow;
11354 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011355 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011356 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011357 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011359 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011360 goto overflow;
11361 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011363 if (ch == '\n' || ch == '\r')
11364 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011365 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011366 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011367 if (!found)
11368 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011369
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011371 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372 if (!u)
11373 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011374 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375
Antoine Pitroue71d5742011-10-04 15:55:09 +020011376 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011377
Antoine Pitroue71d5742011-10-04 15:55:09 +020011378 for (; i < src_len; i++) {
11379 ch = PyUnicode_READ(kind, src_data, i);
11380 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011381 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011382 incr = tabsize - (line_pos % tabsize);
11383 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011384 FILL(kind, dest_data, ' ', j, incr);
11385 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011386 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011387 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011388 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011389 line_pos++;
11390 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011391 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011392 if (ch == '\n' || ch == '\r')
11393 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011394 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011395 }
11396 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011397 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011398
Antoine Pitroue71d5742011-10-04 15:55:09 +020011399 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011400 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11401 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011402}
11403
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011404PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011405 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406\n\
11407Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011408such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409arguments start and end are interpreted as in slice notation.\n\
11410\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011411Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412
11413static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011414unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011416 /* initialize variables to prevent gcc warning */
11417 PyObject *substring = NULL;
11418 Py_ssize_t start = 0;
11419 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011420 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421
Jesus Ceaac451502011-04-20 17:09:23 +020011422 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11423 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
Christian Heimesd47802e2013-06-29 21:33:36 +020011426 if (PyUnicode_READY(self) == -1) {
11427 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011429 }
11430 if (PyUnicode_READY(substring) == -1) {
11431 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011432 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011433 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011434
Victor Stinner7931d9a2011-11-04 00:22:48 +010011435 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436
11437 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011438
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439 if (result == -2)
11440 return NULL;
11441
Christian Heimes217cfd12007-12-02 14:31:20 +000011442 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443}
11444
11445static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011446unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011448 void *data;
11449 enum PyUnicode_Kind kind;
11450 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011451
11452 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11453 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011454 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011455 }
11456 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11457 PyErr_SetString(PyExc_IndexError, "string index out of range");
11458 return NULL;
11459 }
11460 kind = PyUnicode_KIND(self);
11461 data = PyUnicode_DATA(self);
11462 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011463 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464}
11465
Guido van Rossumc2504932007-09-18 19:42:40 +000011466/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011467 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011468static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011469unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470{
Guido van Rossumc2504932007-09-18 19:42:40 +000011471 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011472 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011473
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011474#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011475 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011476#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011477 if (_PyUnicode_HASH(self) != -1)
11478 return _PyUnicode_HASH(self);
11479 if (PyUnicode_READY(self) == -1)
11480 return -1;
11481 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011482 /*
11483 We make the hash of the empty string be 0, rather than using
11484 (prefix ^ suffix), since this slightly obfuscates the hash secret
11485 */
11486 if (len == 0) {
11487 _PyUnicode_HASH(self) = 0;
11488 return 0;
11489 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011490 x = _Py_HashBytes(PyUnicode_DATA(self),
11491 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011492 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011493 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494}
11495
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011496PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011497 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011499Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500
11501static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011502unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011504 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011505 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011506 PyObject *substring = NULL;
11507 Py_ssize_t start = 0;
11508 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509
Jesus Ceaac451502011-04-20 17:09:23 +020011510 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11511 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513
Christian Heimesd47a0452013-06-29 21:21:37 +020011514 if (PyUnicode_READY(self) == -1) {
11515 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011516 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011517 }
11518 if (PyUnicode_READY(substring) == -1) {
11519 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011520 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011521 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011522
Victor Stinner7931d9a2011-11-04 00:22:48 +010011523 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524
11525 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011527 if (result == -2)
11528 return NULL;
11529
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530 if (result < 0) {
11531 PyErr_SetString(PyExc_ValueError, "substring not found");
11532 return NULL;
11533 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011534
Christian Heimes217cfd12007-12-02 14:31:20 +000011535 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536}
11537
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011538PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011539 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011541Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011542at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543
11544static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011545unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 Py_ssize_t i, length;
11548 int kind;
11549 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550 int cased;
11551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011552 if (PyUnicode_READY(self) == -1)
11553 return NULL;
11554 length = PyUnicode_GET_LENGTH(self);
11555 kind = PyUnicode_KIND(self);
11556 data = PyUnicode_DATA(self);
11557
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011559 if (length == 1)
11560 return PyBool_FromLong(
11561 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011563 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011564 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011565 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011566
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011568 for (i = 0; i < length; i++) {
11569 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011570
Benjamin Peterson29060642009-01-31 22:14:21 +000011571 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11572 return PyBool_FromLong(0);
11573 else if (!cased && Py_UNICODE_ISLOWER(ch))
11574 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011576 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577}
11578
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011579PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011580 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011582Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011583at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584
11585static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011586unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011588 Py_ssize_t i, length;
11589 int kind;
11590 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591 int cased;
11592
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011593 if (PyUnicode_READY(self) == -1)
11594 return NULL;
11595 length = PyUnicode_GET_LENGTH(self);
11596 kind = PyUnicode_KIND(self);
11597 data = PyUnicode_DATA(self);
11598
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011600 if (length == 1)
11601 return PyBool_FromLong(
11602 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011604 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011606 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011607
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609 for (i = 0; i < length; i++) {
11610 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011611
Benjamin Peterson29060642009-01-31 22:14:21 +000011612 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11613 return PyBool_FromLong(0);
11614 else if (!cased && Py_UNICODE_ISUPPER(ch))
11615 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011617 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618}
11619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011620PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011621 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011623Return True if S is a titlecased string and there is at least one\n\
11624character in S, i.e. upper- and titlecase characters may only\n\
11625follow uncased characters and lowercase characters only cased ones.\n\
11626Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627
11628static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011629unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631 Py_ssize_t i, length;
11632 int kind;
11633 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634 int cased, previous_is_cased;
11635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011636 if (PyUnicode_READY(self) == -1)
11637 return NULL;
11638 length = PyUnicode_GET_LENGTH(self);
11639 kind = PyUnicode_KIND(self);
11640 data = PyUnicode_DATA(self);
11641
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643 if (length == 1) {
11644 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11645 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11646 (Py_UNICODE_ISUPPER(ch) != 0));
11647 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011649 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011650 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011651 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011652
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653 cased = 0;
11654 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011655 for (i = 0; i < length; i++) {
11656 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011657
Benjamin Peterson29060642009-01-31 22:14:21 +000011658 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11659 if (previous_is_cased)
11660 return PyBool_FromLong(0);
11661 previous_is_cased = 1;
11662 cased = 1;
11663 }
11664 else if (Py_UNICODE_ISLOWER(ch)) {
11665 if (!previous_is_cased)
11666 return PyBool_FromLong(0);
11667 previous_is_cased = 1;
11668 cased = 1;
11669 }
11670 else
11671 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011673 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011674}
11675
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011676PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011677 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011679Return True if all characters in S are whitespace\n\
11680and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681
11682static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011683unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 Py_ssize_t i, length;
11686 int kind;
11687 void *data;
11688
11689 if (PyUnicode_READY(self) == -1)
11690 return NULL;
11691 length = PyUnicode_GET_LENGTH(self);
11692 kind = PyUnicode_KIND(self);
11693 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 if (length == 1)
11697 return PyBool_FromLong(
11698 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011699
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011700 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011702 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 for (i = 0; i < length; i++) {
11705 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011706 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011707 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011709 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710}
11711
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011712PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011713 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011714\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011715Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011716and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011717
11718static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011719unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011720{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011721 Py_ssize_t i, length;
11722 int kind;
11723 void *data;
11724
11725 if (PyUnicode_READY(self) == -1)
11726 return NULL;
11727 length = PyUnicode_GET_LENGTH(self);
11728 kind = PyUnicode_KIND(self);
11729 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011730
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011731 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011732 if (length == 1)
11733 return PyBool_FromLong(
11734 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011735
11736 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011737 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011738 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011740 for (i = 0; i < length; i++) {
11741 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011742 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011743 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011744 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011745}
11746
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011747PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011748 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011749\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011750Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011751and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011752
11753static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011754unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011755{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756 int kind;
11757 void *data;
11758 Py_ssize_t len, i;
11759
11760 if (PyUnicode_READY(self) == -1)
11761 return NULL;
11762
11763 kind = PyUnicode_KIND(self);
11764 data = PyUnicode_DATA(self);
11765 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011766
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011767 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011768 if (len == 1) {
11769 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11770 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11771 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011772
11773 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011774 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011775 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 for (i = 0; i < len; i++) {
11778 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011779 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011780 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011781 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011782 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011783}
11784
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011785PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011786 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011788Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011789False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790
11791static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011792unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011794 Py_ssize_t i, length;
11795 int kind;
11796 void *data;
11797
11798 if (PyUnicode_READY(self) == -1)
11799 return NULL;
11800 length = PyUnicode_GET_LENGTH(self);
11801 kind = PyUnicode_KIND(self);
11802 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011805 if (length == 1)
11806 return PyBool_FromLong(
11807 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011809 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011811 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011813 for (i = 0; i < length; i++) {
11814 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011815 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011817 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818}
11819
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011820PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011821 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011823Return True if all characters in S are digits\n\
11824and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825
11826static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011827unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011829 Py_ssize_t i, length;
11830 int kind;
11831 void *data;
11832
11833 if (PyUnicode_READY(self) == -1)
11834 return NULL;
11835 length = PyUnicode_GET_LENGTH(self);
11836 kind = PyUnicode_KIND(self);
11837 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 if (length == 1) {
11841 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11842 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11843 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011845 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011846 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011847 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011849 for (i = 0; i < length; i++) {
11850 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011851 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011853 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854}
11855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011856PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011857 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011859Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011860False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861
11862static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011863unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011865 Py_ssize_t i, length;
11866 int kind;
11867 void *data;
11868
11869 if (PyUnicode_READY(self) == -1)
11870 return NULL;
11871 length = PyUnicode_GET_LENGTH(self);
11872 kind = PyUnicode_KIND(self);
11873 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874
Guido van Rossumd57fd912000-03-10 22:53:23 +000011875 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 if (length == 1)
11877 return PyBool_FromLong(
11878 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011880 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011882 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011884 for (i = 0; i < length; i++) {
11885 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011886 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011888 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889}
11890
Martin v. Löwis47383402007-08-15 07:32:56 +000011891int
11892PyUnicode_IsIdentifier(PyObject *self)
11893{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011894 int kind;
11895 void *data;
11896 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011897 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 if (PyUnicode_READY(self) == -1) {
11900 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011901 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011902 }
11903
11904 /* Special case for empty strings */
11905 if (PyUnicode_GET_LENGTH(self) == 0)
11906 return 0;
11907 kind = PyUnicode_KIND(self);
11908 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011909
11910 /* PEP 3131 says that the first character must be in
11911 XID_Start and subsequent characters in XID_Continue,
11912 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011913 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011914 letters, digits, underscore). However, given the current
11915 definition of XID_Start and XID_Continue, it is sufficient
11916 to check just for these, except that _ must be allowed
11917 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011918 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011919 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011920 return 0;
11921
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011922 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011923 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011924 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011925 return 1;
11926}
11927
11928PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011929 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011930\n\
11931Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011932to the language definition.\n\
11933\n\
11934Use keyword.iskeyword() to test for reserved identifiers\n\
11935such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011936
11937static PyObject*
11938unicode_isidentifier(PyObject *self)
11939{
11940 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11941}
11942
Georg Brandl559e5d72008-06-11 18:37:52 +000011943PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011944 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011945\n\
11946Return True if all characters in S are considered\n\
11947printable in repr() or S is empty, False otherwise.");
11948
11949static PyObject*
11950unicode_isprintable(PyObject *self)
11951{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 Py_ssize_t i, length;
11953 int kind;
11954 void *data;
11955
11956 if (PyUnicode_READY(self) == -1)
11957 return NULL;
11958 length = PyUnicode_GET_LENGTH(self);
11959 kind = PyUnicode_KIND(self);
11960 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011961
11962 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 if (length == 1)
11964 return PyBool_FromLong(
11965 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 for (i = 0; i < length; i++) {
11968 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011969 Py_RETURN_FALSE;
11970 }
11971 }
11972 Py_RETURN_TRUE;
11973}
11974
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011975PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011976 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977\n\
11978Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011979iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011980
11981static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011982unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011984 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985}
11986
Martin v. Löwis18e16552006-02-15 17:27:45 +000011987static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011988unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 if (PyUnicode_READY(self) == -1)
11991 return -1;
11992 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011993}
11994
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011995PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011998Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011999done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012000
12001static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012002unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012004 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 Py_UCS4 fillchar = ' ';
12006
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012007 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008 return NULL;
12009
Benjamin Petersonbac79492012-01-14 13:34:47 -050012010 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012011 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012012
Victor Stinnerc4b49542011-12-11 22:44:26 +010012013 if (PyUnicode_GET_LENGTH(self) >= width)
12014 return unicode_result_unchanged(self);
12015
12016 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012017}
12018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012019PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012020 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012021\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012022Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012023
12024static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012025unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012027 if (PyUnicode_READY(self) == -1)
12028 return NULL;
12029 if (PyUnicode_IS_ASCII(self))
12030 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012031 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032}
12033
/* Selectors for which end(s) of the string a strip operation trims. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string without its leading
   three characters ("|O:"). */
#define STRIPNAME(i) (stripformat[i] + 3)
12042
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012043/* externally visible for str.strip(unicode) */
12044PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012045_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012046{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012047 void *data;
12048 int kind;
12049 Py_ssize_t i, j, len;
12050 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012051 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12054 return NULL;
12055
12056 kind = PyUnicode_KIND(self);
12057 data = PyUnicode_DATA(self);
12058 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012059 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12061 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012062 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012063
Benjamin Peterson14339b62009-01-31 16:36:08 +000012064 i = 0;
12065 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012066 while (i < len) {
12067 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12068 if (!BLOOM(sepmask, ch))
12069 break;
12070 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12071 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012072 i++;
12073 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012074 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012075
Benjamin Peterson14339b62009-01-31 16:36:08 +000012076 j = len;
12077 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012078 j--;
12079 while (j >= i) {
12080 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12081 if (!BLOOM(sepmask, ch))
12082 break;
12083 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12084 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012085 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012086 }
12087
Benjamin Peterson29060642009-01-31 22:14:21 +000012088 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012089 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012090
Victor Stinner7931d9a2011-11-04 00:22:48 +010012091 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092}
12093
12094PyObject*
12095PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12096{
12097 unsigned char *data;
12098 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012099 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100
Victor Stinnerde636f32011-10-01 03:55:54 +020012101 if (PyUnicode_READY(self) == -1)
12102 return NULL;
12103
Victor Stinner684d5fd2012-05-03 02:32:34 +020012104 length = PyUnicode_GET_LENGTH(self);
12105 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012106
Victor Stinner684d5fd2012-05-03 02:32:34 +020012107 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012108 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109
Victor Stinnerde636f32011-10-01 03:55:54 +020012110 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012111 PyErr_SetString(PyExc_IndexError, "string index out of range");
12112 return NULL;
12113 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012114 if (start >= length || end < start)
12115 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012116
Victor Stinner684d5fd2012-05-03 02:32:34 +020012117 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012118 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012119 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012120 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012121 }
12122 else {
12123 kind = PyUnicode_KIND(self);
12124 data = PyUnicode_1BYTE_DATA(self);
12125 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012126 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012127 length);
12128 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012130
12131static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012132do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012133{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 Py_ssize_t len, i, j;
12135
12136 if (PyUnicode_READY(self) == -1)
12137 return NULL;
12138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012139 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012140
Victor Stinnercc7af722013-04-09 22:39:24 +020012141 if (PyUnicode_IS_ASCII(self)) {
12142 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12143
12144 i = 0;
12145 if (striptype != RIGHTSTRIP) {
12146 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012147 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012148 if (!_Py_ascii_whitespace[ch])
12149 break;
12150 i++;
12151 }
12152 }
12153
12154 j = len;
12155 if (striptype != LEFTSTRIP) {
12156 j--;
12157 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012158 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012159 if (!_Py_ascii_whitespace[ch])
12160 break;
12161 j--;
12162 }
12163 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012164 }
12165 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012166 else {
12167 int kind = PyUnicode_KIND(self);
12168 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012169
Victor Stinnercc7af722013-04-09 22:39:24 +020012170 i = 0;
12171 if (striptype != RIGHTSTRIP) {
12172 while (i < len) {
12173 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12174 if (!Py_UNICODE_ISSPACE(ch))
12175 break;
12176 i++;
12177 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012178 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012179
12180 j = len;
12181 if (striptype != LEFTSTRIP) {
12182 j--;
12183 while (j >= i) {
12184 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12185 if (!Py_UNICODE_ISSPACE(ch))
12186 break;
12187 j--;
12188 }
12189 j++;
12190 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012191 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012192
Victor Stinner7931d9a2011-11-04 00:22:48 +010012193 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194}
12195
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012196
12197static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012198do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012199{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012200 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012201
Serhiy Storchakac6792272013-10-19 21:03:34 +030012202 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012203 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012204
Benjamin Peterson14339b62009-01-31 16:36:08 +000012205 if (sep != NULL && sep != Py_None) {
12206 if (PyUnicode_Check(sep))
12207 return _PyUnicode_XStrip(self, striptype, sep);
12208 else {
12209 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012210 "%s arg must be None or str",
12211 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012212 return NULL;
12213 }
12214 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012215
Benjamin Peterson14339b62009-01-31 16:36:08 +000012216 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012217}
12218
12219
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012220PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012221 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012222\n\
12223Return a copy of the string S with leading and trailing\n\
12224whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012225If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012226
12227static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012228unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012229{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012230 if (PyTuple_GET_SIZE(args) == 0)
12231 return do_strip(self, BOTHSTRIP); /* Common case */
12232 else
12233 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012234}
12235
12236
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012237PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012238 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012239\n\
12240Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012241If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012242
12243static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012244unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012245{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012246 if (PyTuple_GET_SIZE(args) == 0)
12247 return do_strip(self, LEFTSTRIP); /* Common case */
12248 else
12249 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012250}
12251
12252
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012253PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012254 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012255\n\
12256Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012257If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012258
12259static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012260unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012261{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012262 if (PyTuple_GET_SIZE(args) == 0)
12263 return do_strip(self, RIGHTSTRIP); /* Common case */
12264 else
12265 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012266}
12267
12268
Guido van Rossumd57fd912000-03-10 22:53:23 +000012269static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012270unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012272 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274
Serhiy Storchaka05997252013-01-26 12:14:02 +020012275 if (len < 1)
12276 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277
Victor Stinnerc4b49542011-12-11 22:44:26 +010012278 /* no repeat, return original string */
12279 if (len == 1)
12280 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012281
Benjamin Petersonbac79492012-01-14 13:34:47 -050012282 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012283 return NULL;
12284
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012285 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012286 PyErr_SetString(PyExc_OverflowError,
12287 "repeated string is too long");
12288 return NULL;
12289 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012290 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012291
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012292 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012293 if (!u)
12294 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012295 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 if (PyUnicode_GET_LENGTH(str) == 1) {
12298 const int kind = PyUnicode_KIND(str);
12299 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012300 if (kind == PyUnicode_1BYTE_KIND) {
12301 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012302 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012303 }
12304 else if (kind == PyUnicode_2BYTE_KIND) {
12305 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012306 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012307 ucs2[n] = fill_char;
12308 } else {
12309 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12310 assert(kind == PyUnicode_4BYTE_KIND);
12311 for (n = 0; n < len; ++n)
12312 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012313 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012314 }
12315 else {
12316 /* number of characters copied this far */
12317 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012318 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 char *to = (char *) PyUnicode_DATA(u);
12320 Py_MEMCPY(to, PyUnicode_DATA(str),
12321 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012322 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 n = (done <= nchars-done) ? done : nchars-done;
12324 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012325 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012326 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012327 }
12328
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012329 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012330 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012331}
12332
Alexander Belopolsky40018472011-02-26 01:02:56 +000012333PyObject *
12334PyUnicode_Replace(PyObject *obj,
12335 PyObject *subobj,
12336 PyObject *replobj,
12337 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012338{
12339 PyObject *self;
12340 PyObject *str1;
12341 PyObject *str2;
12342 PyObject *result;
12343
12344 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012345 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012346 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012347 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012348 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012349 Py_DECREF(self);
12350 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012351 }
12352 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012353 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012354 Py_DECREF(self);
12355 Py_DECREF(str1);
12356 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012357 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012358 if (PyUnicode_READY(self) == -1 ||
12359 PyUnicode_READY(str1) == -1 ||
12360 PyUnicode_READY(str2) == -1)
12361 result = NULL;
12362 else
12363 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012364 Py_DECREF(self);
12365 Py_DECREF(str1);
12366 Py_DECREF(str2);
12367 return result;
12368}
12369
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012370PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012371 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012372\n\
12373Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012374old replaced by new. If the optional argument count is\n\
12375given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012376
12377static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012378unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012379{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012380 PyObject *str1;
12381 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012382 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012383 PyObject *result;
12384
Martin v. Löwis18e16552006-02-15 17:27:45 +000012385 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012386 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012387 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012388 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012389 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012390 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012391 return NULL;
12392 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012393 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012394 Py_DECREF(str1);
12395 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012396 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012397 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12398 result = NULL;
12399 else
12400 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012401
12402 Py_DECREF(str1);
12403 Py_DECREF(str2);
12404 return result;
12405}
12406
Alexander Belopolsky40018472011-02-26 01:02:56 +000012407static PyObject *
12408unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012409{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012410 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411 Py_ssize_t isize;
12412 Py_ssize_t osize, squote, dquote, i, o;
12413 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012414 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012415 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012417 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012418 return NULL;
12419
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012420 isize = PyUnicode_GET_LENGTH(unicode);
12421 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012423 /* Compute length of output, quote characters, and
12424 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012425 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012426 max = 127;
12427 squote = dquote = 0;
12428 ikind = PyUnicode_KIND(unicode);
12429 for (i = 0; i < isize; i++) {
12430 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012431 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012432 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012433 case '\'': squote++; break;
12434 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012435 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012436 incr = 2;
12437 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012438 default:
12439 /* Fast-path ASCII */
12440 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012441 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012442 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012443 ;
12444 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012445 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012446 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012447 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012448 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012449 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012450 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012451 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012452 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012453 if (osize > PY_SSIZE_T_MAX - incr) {
12454 PyErr_SetString(PyExc_OverflowError,
12455 "string is too long to generate repr");
12456 return NULL;
12457 }
12458 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459 }
12460
12461 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012462 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012463 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012464 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012465 if (dquote)
12466 /* Both squote and dquote present. Use squote,
12467 and escape them */
12468 osize += squote;
12469 else
12470 quote = '"';
12471 }
Victor Stinner55c08782013-04-14 18:45:39 +020012472 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473
12474 repr = PyUnicode_New(osize, max);
12475 if (repr == NULL)
12476 return NULL;
12477 okind = PyUnicode_KIND(repr);
12478 odata = PyUnicode_DATA(repr);
12479
12480 PyUnicode_WRITE(okind, odata, 0, quote);
12481 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012482 if (unchanged) {
12483 _PyUnicode_FastCopyCharacters(repr, 1,
12484 unicode, 0,
12485 isize);
12486 }
12487 else {
12488 for (i = 0, o = 1; i < isize; i++) {
12489 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012490
Victor Stinner55c08782013-04-14 18:45:39 +020012491 /* Escape quotes and backslashes */
12492 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012493 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012495 continue;
12496 }
12497
12498 /* Map special whitespace to '\t', \n', '\r' */
12499 if (ch == '\t') {
12500 PyUnicode_WRITE(okind, odata, o++, '\\');
12501 PyUnicode_WRITE(okind, odata, o++, 't');
12502 }
12503 else if (ch == '\n') {
12504 PyUnicode_WRITE(okind, odata, o++, '\\');
12505 PyUnicode_WRITE(okind, odata, o++, 'n');
12506 }
12507 else if (ch == '\r') {
12508 PyUnicode_WRITE(okind, odata, o++, '\\');
12509 PyUnicode_WRITE(okind, odata, o++, 'r');
12510 }
12511
12512 /* Map non-printable US ASCII to '\xhh' */
12513 else if (ch < ' ' || ch == 0x7F) {
12514 PyUnicode_WRITE(okind, odata, o++, '\\');
12515 PyUnicode_WRITE(okind, odata, o++, 'x');
12516 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12517 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12518 }
12519
12520 /* Copy ASCII characters as-is */
12521 else if (ch < 0x7F) {
12522 PyUnicode_WRITE(okind, odata, o++, ch);
12523 }
12524
12525 /* Non-ASCII characters */
12526 else {
12527 /* Map Unicode whitespace and control characters
12528 (categories Z* and C* except ASCII space)
12529 */
12530 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12531 PyUnicode_WRITE(okind, odata, o++, '\\');
12532 /* Map 8-bit characters to '\xhh' */
12533 if (ch <= 0xff) {
12534 PyUnicode_WRITE(okind, odata, o++, 'x');
12535 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12536 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12537 }
12538 /* Map 16-bit characters to '\uxxxx' */
12539 else if (ch <= 0xffff) {
12540 PyUnicode_WRITE(okind, odata, o++, 'u');
12541 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12542 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12543 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12544 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12545 }
12546 /* Map 21-bit characters to '\U00xxxxxx' */
12547 else {
12548 PyUnicode_WRITE(okind, odata, o++, 'U');
12549 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12550 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12551 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12552 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12553 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12554 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12555 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12556 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12557 }
12558 }
12559 /* Copy characters as-is */
12560 else {
12561 PyUnicode_WRITE(okind, odata, o++, ch);
12562 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012563 }
12564 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012565 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012566 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012567 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012568 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012569}
12570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012571PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012572 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573\n\
12574Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012575such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012576arguments start and end are interpreted as in slice notation.\n\
12577\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012578Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579
12580static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012581unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012582{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012583 /* initialize variables to prevent gcc warning */
12584 PyObject *substring = NULL;
12585 Py_ssize_t start = 0;
12586 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012587 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012588
Jesus Ceaac451502011-04-20 17:09:23 +020012589 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12590 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012591 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012592
Christian Heimesea71a522013-06-29 21:17:34 +020012593 if (PyUnicode_READY(self) == -1) {
12594 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012595 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012596 }
12597 if (PyUnicode_READY(substring) == -1) {
12598 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012599 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012600 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012601
Victor Stinner7931d9a2011-11-04 00:22:48 +010012602 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012603
12604 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012606 if (result == -2)
12607 return NULL;
12608
Christian Heimes217cfd12007-12-02 14:31:20 +000012609 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610}
12611
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012612PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012613 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012615Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012616
12617static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012618unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012619{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012620 /* initialize variables to prevent gcc warning */
12621 PyObject *substring = NULL;
12622 Py_ssize_t start = 0;
12623 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012624 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012625
Jesus Ceaac451502011-04-20 17:09:23 +020012626 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12627 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012628 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012629
Christian Heimesea71a522013-06-29 21:17:34 +020012630 if (PyUnicode_READY(self) == -1) {
12631 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012633 }
12634 if (PyUnicode_READY(substring) == -1) {
12635 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012636 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012637 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012638
Victor Stinner7931d9a2011-11-04 00:22:48 +010012639 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012640
12641 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 if (result == -2)
12644 return NULL;
12645
Guido van Rossumd57fd912000-03-10 22:53:23 +000012646 if (result < 0) {
12647 PyErr_SetString(PyExc_ValueError, "substring not found");
12648 return NULL;
12649 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650
Christian Heimes217cfd12007-12-02 14:31:20 +000012651 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012652}
12653
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012654PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012655 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012656\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012657Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012658done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659
12660static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012661unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012662{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012663 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664 Py_UCS4 fillchar = ' ';
12665
Victor Stinnere9a29352011-10-01 02:14:59 +020012666 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012668
Benjamin Petersonbac79492012-01-14 13:34:47 -050012669 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012670 return NULL;
12671
Victor Stinnerc4b49542011-12-11 22:44:26 +010012672 if (PyUnicode_GET_LENGTH(self) >= width)
12673 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012674
Victor Stinnerc4b49542011-12-11 22:44:26 +010012675 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012676}
12677
Alexander Belopolsky40018472011-02-26 01:02:56 +000012678PyObject *
12679PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012680{
12681 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012682
Guido van Rossumd57fd912000-03-10 22:53:23 +000012683 s = PyUnicode_FromObject(s);
12684 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012685 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012686 if (sep != NULL) {
12687 sep = PyUnicode_FromObject(sep);
12688 if (sep == NULL) {
12689 Py_DECREF(s);
12690 return NULL;
12691 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012692 }
12693
Victor Stinner9310abb2011-10-05 00:59:23 +020012694 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012695
12696 Py_DECREF(s);
12697 Py_XDECREF(sep);
12698 return result;
12699}
12700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012701PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012702 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012703\n\
12704Return a list of the words in S, using sep as the\n\
12705delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012706splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012707whitespace string is a separator and empty strings are\n\
12708removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012709
12710static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012711unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012712{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012713 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012715 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012716
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012717 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12718 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012719 return NULL;
12720
12721 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012722 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012723 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012724 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012725 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012726 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012727}
12728
Thomas Wouters477c8d52006-05-27 19:21:47 +000012729PyObject *
12730PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12731{
12732 PyObject* str_obj;
12733 PyObject* sep_obj;
12734 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012735 int kind1, kind2;
12736 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012737 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012738
12739 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012740 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012741 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012742 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012743 if (!sep_obj) {
12744 Py_DECREF(str_obj);
12745 return NULL;
12746 }
12747 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12748 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012749 Py_DECREF(str_obj);
12750 return NULL;
12751 }
12752
Victor Stinner14f8f022011-10-05 20:58:25 +020012753 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012754 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012755 len1 = PyUnicode_GET_LENGTH(str_obj);
12756 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012757 if (kind1 < kind2 || len1 < len2) {
12758 _Py_INCREF_UNICODE_EMPTY();
12759 if (!unicode_empty)
12760 out = NULL;
12761 else {
12762 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12763 Py_DECREF(unicode_empty);
12764 }
12765 Py_DECREF(sep_obj);
12766 Py_DECREF(str_obj);
12767 return out;
12768 }
12769 buf1 = PyUnicode_DATA(str_obj);
12770 buf2 = PyUnicode_DATA(sep_obj);
12771 if (kind2 != kind1) {
12772 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12773 if (!buf2)
12774 goto onError;
12775 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012776
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012777 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012778 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012779 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12780 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12781 else
12782 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012783 break;
12784 case PyUnicode_2BYTE_KIND:
12785 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12786 break;
12787 case PyUnicode_4BYTE_KIND:
12788 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12789 break;
12790 default:
12791 assert(0);
12792 out = 0;
12793 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012794
12795 Py_DECREF(sep_obj);
12796 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012797 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012798 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012799
12800 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012801 onError:
12802 Py_DECREF(sep_obj);
12803 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012804 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012805 PyMem_Free(buf2);
12806 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012807}
12808
12809
12810PyObject *
12811PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12812{
12813 PyObject* str_obj;
12814 PyObject* sep_obj;
12815 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012816 int kind1, kind2;
12817 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012818 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012819
12820 str_obj = PyUnicode_FromObject(str_in);
12821 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012822 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012823 sep_obj = PyUnicode_FromObject(sep_in);
12824 if (!sep_obj) {
12825 Py_DECREF(str_obj);
12826 return NULL;
12827 }
12828
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012829 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012830 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012831 len1 = PyUnicode_GET_LENGTH(str_obj);
12832 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012833 if (kind1 < kind2 || len1 < len2) {
12834 _Py_INCREF_UNICODE_EMPTY();
12835 if (!unicode_empty)
12836 out = NULL;
12837 else {
12838 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12839 Py_DECREF(unicode_empty);
12840 }
12841 Py_DECREF(sep_obj);
12842 Py_DECREF(str_obj);
12843 return out;
12844 }
12845 buf1 = PyUnicode_DATA(str_obj);
12846 buf2 = PyUnicode_DATA(sep_obj);
12847 if (kind2 != kind1) {
12848 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12849 if (!buf2)
12850 goto onError;
12851 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012852
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012853 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012854 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012855 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12856 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12857 else
12858 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012859 break;
12860 case PyUnicode_2BYTE_KIND:
12861 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12862 break;
12863 case PyUnicode_4BYTE_KIND:
12864 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12865 break;
12866 default:
12867 assert(0);
12868 out = 0;
12869 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012870
12871 Py_DECREF(sep_obj);
12872 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012873 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012874 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012875
12876 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012877 onError:
12878 Py_DECREF(sep_obj);
12879 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012880 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012881 PyMem_Free(buf2);
12882 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012883}
12884
12885PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012886 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012887\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012888Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012889the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012890found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012891
/* Implementation of the str.partition() method: forwards self and the
   separator unchanged to PyUnicode_Partition() and returns its result. */
static PyObject*
unicode_partition(PyObject *self, PyObject *separator)
{
    return PyUnicode_Partition(self, separator);
}
12897
12898PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012899 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012900\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012901Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012902the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012903separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012904
/* Implementation of the str.rpartition() method: forwards self and the
   separator unchanged to PyUnicode_RPartition() and returns its result. */
static PyObject*
unicode_rpartition(PyObject *self, PyObject *separator)
{
    return PyUnicode_RPartition(self, separator);
}
12910
Alexander Belopolsky40018472011-02-26 01:02:56 +000012911PyObject *
12912PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012913{
12914 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012915
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012916 s = PyUnicode_FromObject(s);
12917 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012918 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012919 if (sep != NULL) {
12920 sep = PyUnicode_FromObject(sep);
12921 if (sep == NULL) {
12922 Py_DECREF(s);
12923 return NULL;
12924 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012925 }
12926
Victor Stinner9310abb2011-10-05 00:59:23 +020012927 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012928
12929 Py_DECREF(s);
12930 Py_XDECREF(sep);
12931 return result;
12932}
12933
12934PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012935 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012936\n\
12937Return a list of the words in S, using sep as the\n\
12938delimiter string, starting at the end of the string and\n\
12939working to the front. If maxsplit is given, at most maxsplit\n\
12940splits are done. If sep is not specified, any whitespace string\n\
12941is a separator.");
12942
12943static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012944unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012945{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012946 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012947 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012948 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012949
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012950 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12951 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012952 return NULL;
12953
12954 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012955 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012956 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012957 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012958 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012959 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012960}
12961
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012962PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012963 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012964\n\
12965Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012966Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012967is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012968
12969static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012970unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012971{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012972 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012973 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012974
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012975 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12976 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012977 return NULL;
12978
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012979 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012980}
12981
12982static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012983PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012984{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012985 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012986}
12987
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012988PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012989 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012990\n\
12991Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012992and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012993
12994static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012995unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012996{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012997 if (PyUnicode_READY(self) == -1)
12998 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012999 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013000}
13001
Larry Hastings61272b72014-01-07 12:41:53 -080013002/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013003
Larry Hastings31826802013-10-19 00:09:25 -070013004@staticmethod
13005str.maketrans as unicode_maketrans
13006
13007 x: object
13008
13009 y: unicode=NULL
13010
13011 z: unicode=NULL
13012
13013 /
13014
13015Return a translation table usable for str.translate().
13016
13017If there is only one argument, it must be a dictionary mapping Unicode
13018ordinals (integers) or characters to Unicode ordinals, strings or None.
13019Character keys will be then converted to ordinals.
13020If there are two arguments, they must be strings of equal length, and
13021in the resulting dictionary, each character in x will be mapped to the
13022character at the same position in y. If there is a third argument, it
13023must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013024[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013025
Larry Hastings31826802013-10-19 00:09:25 -070013026static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013027unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013028/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013029{
Georg Brandlceee0772007-11-27 23:48:05 +000013030 PyObject *new = NULL, *key, *value;
13031 Py_ssize_t i = 0;
13032 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013033
Georg Brandlceee0772007-11-27 23:48:05 +000013034 new = PyDict_New();
13035 if (!new)
13036 return NULL;
13037 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013038 int x_kind, y_kind, z_kind;
13039 void *x_data, *y_data, *z_data;
13040
Georg Brandlceee0772007-11-27 23:48:05 +000013041 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013042 if (!PyUnicode_Check(x)) {
13043 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13044 "be a string if there is a second argument");
13045 goto err;
13046 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013047 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013048 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13049 "arguments must have equal length");
13050 goto err;
13051 }
13052 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013053 x_kind = PyUnicode_KIND(x);
13054 y_kind = PyUnicode_KIND(y);
13055 x_data = PyUnicode_DATA(x);
13056 y_data = PyUnicode_DATA(y);
13057 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13058 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013059 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013060 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013061 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013062 if (!value) {
13063 Py_DECREF(key);
13064 goto err;
13065 }
Georg Brandlceee0772007-11-27 23:48:05 +000013066 res = PyDict_SetItem(new, key, value);
13067 Py_DECREF(key);
13068 Py_DECREF(value);
13069 if (res < 0)
13070 goto err;
13071 }
13072 /* create entries for deleting chars in z */
13073 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013074 z_kind = PyUnicode_KIND(z);
13075 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013076 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013077 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013078 if (!key)
13079 goto err;
13080 res = PyDict_SetItem(new, key, Py_None);
13081 Py_DECREF(key);
13082 if (res < 0)
13083 goto err;
13084 }
13085 }
13086 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013087 int kind;
13088 void *data;
13089
Georg Brandlceee0772007-11-27 23:48:05 +000013090 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013091 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013092 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13093 "to maketrans it must be a dict");
13094 goto err;
13095 }
13096 /* copy entries into the new dict, converting string keys to int keys */
13097 while (PyDict_Next(x, &i, &key, &value)) {
13098 if (PyUnicode_Check(key)) {
13099 /* convert string keys to integer keys */
13100 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013101 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013102 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13103 "table must be of length 1");
13104 goto err;
13105 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013106 kind = PyUnicode_KIND(key);
13107 data = PyUnicode_DATA(key);
13108 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013109 if (!newkey)
13110 goto err;
13111 res = PyDict_SetItem(new, newkey, value);
13112 Py_DECREF(newkey);
13113 if (res < 0)
13114 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013115 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013116 /* just keep integer keys */
13117 if (PyDict_SetItem(new, key, value) < 0)
13118 goto err;
13119 } else {
13120 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13121 "be strings or integers");
13122 goto err;
13123 }
13124 }
13125 }
13126 return new;
13127 err:
13128 Py_DECREF(new);
13129 return NULL;
13130}
13131
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013132PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013133 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013134\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013135Return a copy of the string S in which each character has been mapped\n\
13136through the given translation table. The table must implement\n\
13137lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13138mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13139this operation raises LookupError, the character is left untouched.\n\
13140Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013141
13142static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013143unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013144{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013145 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013146}
13147
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013148PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013149 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013150\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013151Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013152
13153static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013154unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013155{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013156 if (PyUnicode_READY(self) == -1)
13157 return NULL;
13158 if (PyUnicode_IS_ASCII(self))
13159 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013160 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013161}
13162
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013163PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013164 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013165\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013166Pad a numeric string S with zeros on the left, to fill a field\n\
13167of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013168
13169static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013170unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013171{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013172 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013173 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013174 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013175 int kind;
13176 void *data;
13177 Py_UCS4 chr;
13178
Martin v. Löwis18e16552006-02-15 17:27:45 +000013179 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013180 return NULL;
13181
Benjamin Petersonbac79492012-01-14 13:34:47 -050013182 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013183 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013184
Victor Stinnerc4b49542011-12-11 22:44:26 +010013185 if (PyUnicode_GET_LENGTH(self) >= width)
13186 return unicode_result_unchanged(self);
13187
13188 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013189
13190 u = pad(self, fill, 0, '0');
13191
Walter Dörwald068325e2002-04-15 13:36:47 +000013192 if (u == NULL)
13193 return NULL;
13194
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013195 kind = PyUnicode_KIND(u);
13196 data = PyUnicode_DATA(u);
13197 chr = PyUnicode_READ(kind, data, fill);
13198
13199 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013200 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013201 PyUnicode_WRITE(kind, data, 0, chr);
13202 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013203 }
13204
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013205 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013206 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013207}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013208
13209#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013210static PyObject *
13211unicode__decimal2ascii(PyObject *self)
13212{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013213 return PyUnicode_TransformDecimalAndSpaceToASCII(self);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013214}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013215#endif
13216
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013217PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013218 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013219\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013220Return True if S starts with the specified prefix, False otherwise.\n\
13221With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013222With optional end, stop comparing S at that position.\n\
13223prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013224
13225static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013226unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013227 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013228{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013229 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013230 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013231 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013232 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013233 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013234
Jesus Ceaac451502011-04-20 17:09:23 +020013235 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013236 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013237 if (PyTuple_Check(subobj)) {
13238 Py_ssize_t i;
13239 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013240 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013241 if (substring == NULL)
13242 return NULL;
13243 result = tailmatch(self, substring, start, end, -1);
13244 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013245 if (result == -1)
13246 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013247 if (result) {
13248 Py_RETURN_TRUE;
13249 }
13250 }
13251 /* nothing matched */
13252 Py_RETURN_FALSE;
13253 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013254 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013255 if (substring == NULL) {
13256 if (PyErr_ExceptionMatches(PyExc_TypeError))
13257 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13258 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013259 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013260 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013261 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013262 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013263 if (result == -1)
13264 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013265 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013266}
13267
13268
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013269PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013270 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013271\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013272Return True if S ends with the specified suffix, False otherwise.\n\
13273With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013274With optional end, stop comparing S at that position.\n\
13275suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013276
13277static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013278unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013279 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013280{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013281 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013282 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013283 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013284 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013285 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013286
Jesus Ceaac451502011-04-20 17:09:23 +020013287 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013288 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013289 if (PyTuple_Check(subobj)) {
13290 Py_ssize_t i;
13291 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013292 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013293 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013294 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013295 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013296 result = tailmatch(self, substring, start, end, +1);
13297 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013298 if (result == -1)
13299 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013300 if (result) {
13301 Py_RETURN_TRUE;
13302 }
13303 }
13304 Py_RETURN_FALSE;
13305 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013306 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013307 if (substring == NULL) {
13308 if (PyErr_ExceptionMatches(PyExc_TypeError))
13309 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13310 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013311 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013312 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013313 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013314 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013315 if (result == -1)
13316 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013317 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013318}
13319
Victor Stinner202fdca2012-05-07 12:47:02 +020013320Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013321_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013322{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013323 if (!writer->readonly)
13324 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13325 else {
13326 /* Copy-on-write mode: set buffer size to 0 so
13327 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13328 * next write. */
13329 writer->size = 0;
13330 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013331 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13332 writer->data = PyUnicode_DATA(writer->buffer);
13333 writer->kind = PyUnicode_KIND(writer->buffer);
13334}
13335
Victor Stinnerd3f08822012-05-29 12:57:52 +020013336void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013337_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013338{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013339 memset(writer, 0, sizeof(*writer));
13340#ifdef Py_DEBUG
13341 writer->kind = 5; /* invalid kind */
13342#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013343 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013344}
13345
Victor Stinnerd3f08822012-05-29 12:57:52 +020013346int
13347_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13348 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013349{
Victor Stinner6989ba02013-11-18 21:08:39 +010013350#ifdef MS_WINDOWS
13351 /* On Windows, overallocate by 50% is the best factor */
13352# define OVERALLOCATE_FACTOR 2
13353#else
13354 /* On Linux, overallocate by 25% is the best factor */
13355# define OVERALLOCATE_FACTOR 4
13356#endif
Victor Stinner202fdca2012-05-07 12:47:02 +020013357 Py_ssize_t newlen;
13358 PyObject *newbuffer;
13359
Victor Stinnerd3f08822012-05-29 12:57:52 +020013360 assert(length > 0);
13361
Victor Stinner202fdca2012-05-07 12:47:02 +020013362 if (length > PY_SSIZE_T_MAX - writer->pos) {
13363 PyErr_NoMemory();
13364 return -1;
13365 }
13366 newlen = writer->pos + length;
13367
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013368 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013369
Victor Stinnerd3f08822012-05-29 12:57:52 +020013370 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013371 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013372 if (writer->overallocate
13373 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13374 /* overallocate to limit the number of realloc() */
13375 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013376 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013377 if (newlen < writer->min_length)
13378 newlen = writer->min_length;
13379
Victor Stinnerd3f08822012-05-29 12:57:52 +020013380 writer->buffer = PyUnicode_New(newlen, maxchar);
13381 if (writer->buffer == NULL)
13382 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013383 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013384 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013385 if (writer->overallocate
13386 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13387 /* overallocate to limit the number of realloc() */
13388 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013389 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013390 if (newlen < writer->min_length)
13391 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013392
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013393 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013394 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013395 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013396 newbuffer = PyUnicode_New(newlen, maxchar);
13397 if (newbuffer == NULL)
13398 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013399 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13400 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013401 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013402 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013403 }
13404 else {
13405 newbuffer = resize_compact(writer->buffer, newlen);
13406 if (newbuffer == NULL)
13407 return -1;
13408 }
13409 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013410 }
13411 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013412 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013413 newbuffer = PyUnicode_New(writer->size, maxchar);
13414 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013415 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013416 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13417 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013418 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013419 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013420 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013421 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013422
13423#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013424}
13425
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013426Py_LOCAL_INLINE(int)
13427_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013428{
13429 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13430 return -1;
13431 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13432 writer->pos++;
13433 return 0;
13434}
13435
13436int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013437_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13438{
13439 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13440}
13441
13442int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013443_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13444{
13445 Py_UCS4 maxchar;
13446 Py_ssize_t len;
13447
13448 if (PyUnicode_READY(str) == -1)
13449 return -1;
13450 len = PyUnicode_GET_LENGTH(str);
13451 if (len == 0)
13452 return 0;
13453 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13454 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013455 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013456 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013457 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013458 Py_INCREF(str);
13459 writer->buffer = str;
13460 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013461 writer->pos += len;
13462 return 0;
13463 }
13464 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13465 return -1;
13466 }
13467 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13468 str, 0, len);
13469 writer->pos += len;
13470 return 0;
13471}
13472
Victor Stinnere215d962012-10-06 23:03:36 +020013473int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013474_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13475 Py_ssize_t start, Py_ssize_t end)
13476{
13477 Py_UCS4 maxchar;
13478 Py_ssize_t len;
13479
13480 if (PyUnicode_READY(str) == -1)
13481 return -1;
13482
13483 assert(0 <= start);
13484 assert(end <= PyUnicode_GET_LENGTH(str));
13485 assert(start <= end);
13486
13487 if (end == 0)
13488 return 0;
13489
13490 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13491 return _PyUnicodeWriter_WriteStr(writer, str);
13492
13493 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13494 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13495 else
13496 maxchar = writer->maxchar;
13497 len = end - start;
13498
13499 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13500 return -1;
13501
13502 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13503 str, start, len);
13504 writer->pos += len;
13505 return 0;
13506}
13507
13508int
Victor Stinner4a587072013-11-19 12:54:53 +010013509_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13510 const char *ascii, Py_ssize_t len)
13511{
13512 if (len == -1)
13513 len = strlen(ascii);
13514
13515 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13516
13517 if (writer->buffer == NULL && !writer->overallocate) {
13518 PyObject *str;
13519
13520 str = _PyUnicode_FromASCII(ascii, len);
13521 if (str == NULL)
13522 return -1;
13523
13524 writer->readonly = 1;
13525 writer->buffer = str;
13526 _PyUnicodeWriter_Update(writer);
13527 writer->pos += len;
13528 return 0;
13529 }
13530
13531 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13532 return -1;
13533
13534 switch (writer->kind)
13535 {
13536 case PyUnicode_1BYTE_KIND:
13537 {
13538 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13539 Py_UCS1 *data = writer->data;
13540
13541 Py_MEMCPY(data + writer->pos, str, len);
13542 break;
13543 }
13544 case PyUnicode_2BYTE_KIND:
13545 {
13546 _PyUnicode_CONVERT_BYTES(
13547 Py_UCS1, Py_UCS2,
13548 ascii, ascii + len,
13549 (Py_UCS2 *)writer->data + writer->pos);
13550 break;
13551 }
13552 case PyUnicode_4BYTE_KIND:
13553 {
13554 _PyUnicode_CONVERT_BYTES(
13555 Py_UCS1, Py_UCS4,
13556 ascii, ascii + len,
13557 (Py_UCS4 *)writer->data + writer->pos);
13558 break;
13559 }
13560 default:
13561 assert(0);
13562 }
13563
13564 writer->pos += len;
13565 return 0;
13566}
13567
13568int
13569_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13570 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013571{
13572 Py_UCS4 maxchar;
13573
13574 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13575 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13576 return -1;
13577 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13578 writer->pos += len;
13579 return 0;
13580}
13581
Victor Stinnerd3f08822012-05-29 12:57:52 +020013582PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013583_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013584{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013585 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013586 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013587 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013588 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013589 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013590 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013591 str = writer->buffer;
13592 writer->buffer = NULL;
13593 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13594 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013595 }
13596 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13597 PyObject *newbuffer;
13598 newbuffer = resize_compact(writer->buffer, writer->pos);
13599 if (newbuffer == NULL) {
Serhiy Storchakadfe98a12014-02-09 13:46:20 +020013600 Py_CLEAR(writer->buffer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013601 return NULL;
13602 }
13603 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013604 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013605 str = writer->buffer;
13606 writer->buffer = NULL;
13607 assert(_PyUnicode_CheckConsistency(str, 1));
13608 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013609}
13610
Victor Stinnerd3f08822012-05-29 12:57:52 +020013611void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013612_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013613{
13614 Py_CLEAR(writer->buffer);
13615}
13616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013617#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013618
13619PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013620 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013621\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013622Return a formatted version of S, using substitutions from args and kwargs.\n\
13623The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013624
Eric Smith27bbca62010-11-04 17:06:58 +000013625PyDoc_STRVAR(format_map__doc__,
13626 "S.format_map(mapping) -> str\n\
13627\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013628Return a formatted version of S, using substitutions from mapping.\n\
13629The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013630
Eric Smith4a7d76d2008-05-30 18:10:19 +000013631static PyObject *
13632unicode__format__(PyObject* self, PyObject* args)
13633{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013634 PyObject *format_spec;
13635 _PyUnicodeWriter writer;
13636 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013637
13638 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13639 return NULL;
13640
Victor Stinnerd3f08822012-05-29 12:57:52 +020013641 if (PyUnicode_READY(self) == -1)
13642 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013643 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013644 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13645 self, format_spec, 0,
13646 PyUnicode_GET_LENGTH(format_spec));
13647 if (ret == -1) {
13648 _PyUnicodeWriter_Dealloc(&writer);
13649 return NULL;
13650 }
13651 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013652}
13653
Eric Smith8c663262007-08-25 02:26:07 +000013654PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013655 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013656\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013657Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013658
13659static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013660unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013661{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013662 Py_ssize_t size;
13663
13664 /* If it's a compact object, account for base structure +
13665 character data. */
13666 if (PyUnicode_IS_COMPACT_ASCII(v))
13667 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13668 else if (PyUnicode_IS_COMPACT(v))
13669 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013670 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013671 else {
13672 /* If it is a two-block object, account for base object, and
13673 for character block if present. */
13674 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013675 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013676 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013677 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013678 }
13679 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013680 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013681 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013682 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013683 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013684 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013685
13686 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013687}
13688
13689PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013690 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013691
13692static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013693unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013694{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013695 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013696 if (!copy)
13697 return NULL;
13698 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013699}
13700
Guido van Rossumd57fd912000-03-10 22:53:23 +000013701static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013702 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013703 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013704 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13705 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013706 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13707 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013708 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013709 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13710 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13711 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013712 {"expandtabs", (PyCFunction) unicode_expandtabs,
13713 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013714 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013715 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013716 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13717 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13718 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013719 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013720 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13721 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13722 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013723 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013724 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013725 {"splitlines", (PyCFunction) unicode_splitlines,
13726 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013727 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013728 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13729 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13730 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13731 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13732 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13733 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13734 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13735 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13736 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13737 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13738 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13739 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13740 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13741 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013742 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013743 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013744 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013745 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013746 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013747 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013748 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013749 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013750#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013751 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013752 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013753#endif
13754
Benjamin Peterson14339b62009-01-31 16:36:08 +000013755 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013756 {NULL, NULL}
13757};
13758
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013759static PyObject *
13760unicode_mod(PyObject *v, PyObject *w)
13761{
Brian Curtindfc80e32011-08-10 20:28:54 -050013762 if (!PyUnicode_Check(v))
13763 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013764 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013765}
13766
13767static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013768 0, /*nb_add*/
13769 0, /*nb_subtract*/
13770 0, /*nb_multiply*/
13771 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013772};
13773
Guido van Rossumd57fd912000-03-10 22:53:23 +000013774static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013775 (lenfunc) unicode_length, /* sq_length */
13776 PyUnicode_Concat, /* sq_concat */
13777 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13778 (ssizeargfunc) unicode_getitem, /* sq_item */
13779 0, /* sq_slice */
13780 0, /* sq_ass_item */
13781 0, /* sq_ass_slice */
13782 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013783};
13784
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013785static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013786unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013787{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013788 if (PyUnicode_READY(self) == -1)
13789 return NULL;
13790
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013791 if (PyIndex_Check(item)) {
13792 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013793 if (i == -1 && PyErr_Occurred())
13794 return NULL;
13795 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013796 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013797 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013798 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013799 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013800 PyObject *result;
13801 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013802 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013803 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013805 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013806 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013807 return NULL;
13808 }
13809
13810 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013811 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013812 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013813 slicelength == PyUnicode_GET_LENGTH(self)) {
13814 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013815 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013816 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013817 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013818 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013819 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013820 src_kind = PyUnicode_KIND(self);
13821 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013822 if (!PyUnicode_IS_ASCII(self)) {
13823 kind_limit = kind_maxchar_limit(src_kind);
13824 max_char = 0;
13825 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13826 ch = PyUnicode_READ(src_kind, src_data, cur);
13827 if (ch > max_char) {
13828 max_char = ch;
13829 if (max_char >= kind_limit)
13830 break;
13831 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013832 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013833 }
Victor Stinner55c99112011-10-13 01:17:06 +020013834 else
13835 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013836 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013837 if (result == NULL)
13838 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013839 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013840 dest_data = PyUnicode_DATA(result);
13841
13842 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013843 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13844 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013845 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013846 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013847 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013848 } else {
13849 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13850 return NULL;
13851 }
13852}
13853
13854static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013855 (lenfunc)unicode_length, /* mp_length */
13856 (binaryfunc)unicode_subscript, /* mp_subscript */
13857 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013858};
13859
Guido van Rossumd57fd912000-03-10 22:53:23 +000013860
Guido van Rossumd57fd912000-03-10 22:53:23 +000013861/* Helpers for PyUnicode_Format() */
13862
Victor Stinnera47082312012-10-04 02:19:54 +020013863struct unicode_formatter_t {
13864 PyObject *args;
13865 int args_owned;
13866 Py_ssize_t arglen, argidx;
13867 PyObject *dict;
13868
13869 enum PyUnicode_Kind fmtkind;
13870 Py_ssize_t fmtcnt, fmtpos;
13871 void *fmtdata;
13872 PyObject *fmtstr;
13873
13874 _PyUnicodeWriter writer;
13875};
13876
13877struct unicode_format_arg_t {
13878 Py_UCS4 ch;
13879 int flags;
13880 Py_ssize_t width;
13881 int prec;
13882 int sign;
13883};
13884
Guido van Rossumd57fd912000-03-10 22:53:23 +000013885static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013886unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013887{
Victor Stinnera47082312012-10-04 02:19:54 +020013888 Py_ssize_t argidx = ctx->argidx;
13889
13890 if (argidx < ctx->arglen) {
13891 ctx->argidx++;
13892 if (ctx->arglen < 0)
13893 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013894 else
Victor Stinnera47082312012-10-04 02:19:54 +020013895 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013896 }
13897 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013898 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013899 return NULL;
13900}
13901
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013902/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013903
Victor Stinnera47082312012-10-04 02:19:54 +020013904/* Format a float into the writer if the writer is not NULL, or into *p_output
13905 otherwise.
13906
13907 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013908static int
Victor Stinnera47082312012-10-04 02:19:54 +020013909formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13910 PyObject **p_output,
13911 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013912{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013913 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013914 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013915 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013916 int prec;
13917 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013918
Guido van Rossumd57fd912000-03-10 22:53:23 +000013919 x = PyFloat_AsDouble(v);
13920 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013921 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013922
Victor Stinnera47082312012-10-04 02:19:54 +020013923 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013924 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013925 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013926
Victor Stinnera47082312012-10-04 02:19:54 +020013927 if (arg->flags & F_ALT)
13928 dtoa_flags = Py_DTSF_ALT;
13929 else
13930 dtoa_flags = 0;
13931 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013932 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013933 return -1;
13934 len = strlen(p);
13935 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013936 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013937 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013938 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013939 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013940 }
13941 else
13942 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013943 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013944 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013945}
13946
Victor Stinnerd0880d52012-04-27 23:40:13 +020013947/* formatlong() emulates the format codes d, u, o, x and X, and
13948 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13949 * Python's regular ints.
13950 * Return value: a new PyUnicodeObject*, or NULL if error.
13951 * The output string is of the form
13952 * "-"? ("0x" | "0X")? digit+
13953 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13954 * set in flags. The case of hex digits will be correct,
13955 * There will be at least prec digits, zero-filled on the left if
13956 * necessary to get that many.
13957 * val object to be converted
13958 * flags bitmask of format flags; only F_ALT is looked at
13959 * prec minimum number of digits; 0-fill on left if needed
13960 * type a character in [duoxX]; u acts the same as d
13961 *
13962 * CAUTION: o, x and X conversions on regular ints can never
13963 * produce a '-' sign, but can for Python's unbounded ints.
13964 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013965PyObject *
13966_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000013967{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013968 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013969 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013970 Py_ssize_t i;
13971 int sign; /* 1 if '-', else 0 */
13972 int len; /* number of characters */
13973 Py_ssize_t llen;
13974 int numdigits; /* len == numnondigits + numdigits */
13975 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013976
Victor Stinnerd0880d52012-04-27 23:40:13 +020013977 /* Avoid exceeding SSIZE_T_MAX */
13978 if (prec > INT_MAX-3) {
13979 PyErr_SetString(PyExc_OverflowError,
13980 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013981 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013982 }
13983
13984 assert(PyLong_Check(val));
13985
13986 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013987 default:
13988 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013989 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013990 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013991 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013992 /* int and int subclasses should print numerically when a numeric */
13993 /* format code is used (see issue18780) */
13994 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013995 break;
13996 case 'o':
13997 numnondigits = 2;
13998 result = PyNumber_ToBase(val, 8);
13999 break;
14000 case 'x':
14001 case 'X':
14002 numnondigits = 2;
14003 result = PyNumber_ToBase(val, 16);
14004 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014005 }
14006 if (!result)
14007 return NULL;
14008
14009 assert(unicode_modifiable(result));
14010 assert(PyUnicode_IS_READY(result));
14011 assert(PyUnicode_IS_ASCII(result));
14012
14013 /* To modify the string in-place, there can only be one reference. */
14014 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014015 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014016 PyErr_BadInternalCall();
14017 return NULL;
14018 }
14019 buf = PyUnicode_DATA(result);
14020 llen = PyUnicode_GET_LENGTH(result);
14021 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014022 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014023 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014024 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014025 return NULL;
14026 }
14027 len = (int)llen;
14028 sign = buf[0] == '-';
14029 numnondigits += sign;
14030 numdigits = len - numnondigits;
14031 assert(numdigits > 0);
14032
14033 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014034 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014035 (type == 'o' || type == 'x' || type == 'X'))) {
14036 assert(buf[sign] == '0');
14037 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14038 buf[sign+1] == 'o');
14039 numnondigits -= 2;
14040 buf += 2;
14041 len -= 2;
14042 if (sign)
14043 buf[0] = '-';
14044 assert(len == numnondigits + numdigits);
14045 assert(numdigits > 0);
14046 }
14047
14048 /* Fill with leading zeroes to meet minimum width. */
14049 if (prec > numdigits) {
14050 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14051 numnondigits + prec);
14052 char *b1;
14053 if (!r1) {
14054 Py_DECREF(result);
14055 return NULL;
14056 }
14057 b1 = PyBytes_AS_STRING(r1);
14058 for (i = 0; i < numnondigits; ++i)
14059 *b1++ = *buf++;
14060 for (i = 0; i < prec - numdigits; i++)
14061 *b1++ = '0';
14062 for (i = 0; i < numdigits; i++)
14063 *b1++ = *buf++;
14064 *b1 = '\0';
14065 Py_DECREF(result);
14066 result = r1;
14067 buf = PyBytes_AS_STRING(result);
14068 len = numnondigits + prec;
14069 }
14070
14071 /* Fix up case for hex conversions. */
14072 if (type == 'X') {
14073 /* Need to convert all lower case letters to upper case.
14074 and need to convert 0x to 0X (and -0x to -0X). */
14075 for (i = 0; i < len; i++)
14076 if (buf[i] >= 'a' && buf[i] <= 'x')
14077 buf[i] -= 'a'-'A';
14078 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014079 if (!PyUnicode_Check(result)
14080 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014081 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014082 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014083 Py_DECREF(result);
14084 result = unicode;
14085 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014086 else if (len != PyUnicode_GET_LENGTH(result)) {
14087 if (PyUnicode_Resize(&result, len) < 0)
14088 Py_CLEAR(result);
14089 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014090 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014091}
14092
Ethan Furmandf3ed242014-01-05 06:50:30 -080014093/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014094 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014095 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014096 * -1 and raise an exception on error */
14097static int
Victor Stinnera47082312012-10-04 02:19:54 +020014098mainformatlong(PyObject *v,
14099 struct unicode_format_arg_t *arg,
14100 PyObject **p_output,
14101 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014102{
14103 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014104 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014105
14106 if (!PyNumber_Check(v))
14107 goto wrongtype;
14108
Ethan Furman9ab74802014-03-21 06:38:46 -070014109 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014110 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014111 if (type == 'o' || type == 'x' || type == 'X') {
14112 iobj = PyNumber_Index(v);
14113 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014114 if (PyErr_ExceptionMatches(PyExc_TypeError))
14115 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014116 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014117 }
14118 }
14119 else {
14120 iobj = PyNumber_Long(v);
14121 if (iobj == NULL ) {
14122 if (PyErr_ExceptionMatches(PyExc_TypeError))
14123 goto wrongtype;
14124 return -1;
14125 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014126 }
14127 assert(PyLong_Check(iobj));
14128 }
14129 else {
14130 iobj = v;
14131 Py_INCREF(iobj);
14132 }
14133
14134 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014135 && arg->width == -1 && arg->prec == -1
14136 && !(arg->flags & (F_SIGN | F_BLANK))
14137 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014138 {
14139 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014140 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014141 int base;
14142
Victor Stinnera47082312012-10-04 02:19:54 +020014143 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014144 {
14145 default:
14146 assert(0 && "'type' not in [diuoxX]");
14147 case 'd':
14148 case 'i':
14149 case 'u':
14150 base = 10;
14151 break;
14152 case 'o':
14153 base = 8;
14154 break;
14155 case 'x':
14156 case 'X':
14157 base = 16;
14158 break;
14159 }
14160
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014161 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14162 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014163 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014164 }
14165 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014166 return 1;
14167 }
14168
Ethan Furmanb95b5612015-01-23 20:05:18 -080014169 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014170 Py_DECREF(iobj);
14171 if (res == NULL)
14172 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014173 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014174 return 0;
14175
14176wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014177 switch(type)
14178 {
14179 case 'o':
14180 case 'x':
14181 case 'X':
14182 PyErr_Format(PyExc_TypeError,
14183 "%%%c format: an integer is required, "
14184 "not %.200s",
14185 type, Py_TYPE(v)->tp_name);
14186 break;
14187 default:
14188 PyErr_Format(PyExc_TypeError,
14189 "%%%c format: a number is required, "
14190 "not %.200s",
14191 type, Py_TYPE(v)->tp_name);
14192 break;
14193 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014194 return -1;
14195}
14196
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014197static Py_UCS4
14198formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014199{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014200 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014201 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014202 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014203 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014204 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014205 goto onError;
14206 }
14207 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014208 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014209 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014210 /* make sure number is a type of integer */
14211 if (!PyLong_Check(v)) {
14212 iobj = PyNumber_Index(v);
14213 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014214 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014215 }
14216 v = iobj;
14217 Py_DECREF(iobj);
14218 }
14219 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014220 x = PyLong_AsLong(v);
14221 if (x == -1 && PyErr_Occurred())
14222 goto onError;
14223
Victor Stinner8faf8212011-12-08 22:14:11 +010014224 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014225 PyErr_SetString(PyExc_OverflowError,
14226 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014227 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014228 }
14229
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014230 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014231 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014232
Benjamin Peterson29060642009-01-31 22:14:21 +000014233 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014234 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014235 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014236 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014237}
14238
Victor Stinnera47082312012-10-04 02:19:54 +020014239/* Parse options of an argument: flags, width, precision.
14240 Handle also "%(name)" syntax.
14241
14242 Return 0 if the argument has been formatted into arg->str.
14243 Return 1 if the argument has been written into ctx->writer,
14244 Raise an exception and return -1 on error. */
14245static int
14246unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14247 struct unicode_format_arg_t *arg)
14248{
14249#define FORMAT_READ(ctx) \
14250 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14251
14252 PyObject *v;
14253
Victor Stinnera47082312012-10-04 02:19:54 +020014254 if (arg->ch == '(') {
14255 /* Get argument value from a dictionary. Example: "%(name)s". */
14256 Py_ssize_t keystart;
14257 Py_ssize_t keylen;
14258 PyObject *key;
14259 int pcount = 1;
14260
14261 if (ctx->dict == NULL) {
14262 PyErr_SetString(PyExc_TypeError,
14263 "format requires a mapping");
14264 return -1;
14265 }
14266 ++ctx->fmtpos;
14267 --ctx->fmtcnt;
14268 keystart = ctx->fmtpos;
14269 /* Skip over balanced parentheses */
14270 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14271 arg->ch = FORMAT_READ(ctx);
14272 if (arg->ch == ')')
14273 --pcount;
14274 else if (arg->ch == '(')
14275 ++pcount;
14276 ctx->fmtpos++;
14277 }
14278 keylen = ctx->fmtpos - keystart - 1;
14279 if (ctx->fmtcnt < 0 || pcount > 0) {
14280 PyErr_SetString(PyExc_ValueError,
14281 "incomplete format key");
14282 return -1;
14283 }
14284 key = PyUnicode_Substring(ctx->fmtstr,
14285 keystart, keystart + keylen);
14286 if (key == NULL)
14287 return -1;
14288 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014289 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014290 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014291 }
14292 ctx->args = PyObject_GetItem(ctx->dict, key);
14293 Py_DECREF(key);
14294 if (ctx->args == NULL)
14295 return -1;
14296 ctx->args_owned = 1;
14297 ctx->arglen = -1;
14298 ctx->argidx = -2;
14299 }
14300
14301 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014302 while (--ctx->fmtcnt >= 0) {
14303 arg->ch = FORMAT_READ(ctx);
14304 ctx->fmtpos++;
14305 switch (arg->ch) {
14306 case '-': arg->flags |= F_LJUST; continue;
14307 case '+': arg->flags |= F_SIGN; continue;
14308 case ' ': arg->flags |= F_BLANK; continue;
14309 case '#': arg->flags |= F_ALT; continue;
14310 case '0': arg->flags |= F_ZERO; continue;
14311 }
14312 break;
14313 }
14314
14315 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014316 if (arg->ch == '*') {
14317 v = unicode_format_getnextarg(ctx);
14318 if (v == NULL)
14319 return -1;
14320 if (!PyLong_Check(v)) {
14321 PyErr_SetString(PyExc_TypeError,
14322 "* wants int");
14323 return -1;
14324 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014325 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014326 if (arg->width == -1 && PyErr_Occurred())
14327 return -1;
14328 if (arg->width < 0) {
14329 arg->flags |= F_LJUST;
14330 arg->width = -arg->width;
14331 }
14332 if (--ctx->fmtcnt >= 0) {
14333 arg->ch = FORMAT_READ(ctx);
14334 ctx->fmtpos++;
14335 }
14336 }
14337 else if (arg->ch >= '0' && arg->ch <= '9') {
14338 arg->width = arg->ch - '0';
14339 while (--ctx->fmtcnt >= 0) {
14340 arg->ch = FORMAT_READ(ctx);
14341 ctx->fmtpos++;
14342 if (arg->ch < '0' || arg->ch > '9')
14343 break;
14344 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14345 mixing signed and unsigned comparison. Since arg->ch is between
14346 '0' and '9', casting to int is safe. */
14347 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14348 PyErr_SetString(PyExc_ValueError,
14349 "width too big");
14350 return -1;
14351 }
14352 arg->width = arg->width*10 + (arg->ch - '0');
14353 }
14354 }
14355
14356 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014357 if (arg->ch == '.') {
14358 arg->prec = 0;
14359 if (--ctx->fmtcnt >= 0) {
14360 arg->ch = FORMAT_READ(ctx);
14361 ctx->fmtpos++;
14362 }
14363 if (arg->ch == '*') {
14364 v = unicode_format_getnextarg(ctx);
14365 if (v == NULL)
14366 return -1;
14367 if (!PyLong_Check(v)) {
14368 PyErr_SetString(PyExc_TypeError,
14369 "* wants int");
14370 return -1;
14371 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014372 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014373 if (arg->prec == -1 && PyErr_Occurred())
14374 return -1;
14375 if (arg->prec < 0)
14376 arg->prec = 0;
14377 if (--ctx->fmtcnt >= 0) {
14378 arg->ch = FORMAT_READ(ctx);
14379 ctx->fmtpos++;
14380 }
14381 }
14382 else if (arg->ch >= '0' && arg->ch <= '9') {
14383 arg->prec = arg->ch - '0';
14384 while (--ctx->fmtcnt >= 0) {
14385 arg->ch = FORMAT_READ(ctx);
14386 ctx->fmtpos++;
14387 if (arg->ch < '0' || arg->ch > '9')
14388 break;
14389 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14390 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014391 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014392 return -1;
14393 }
14394 arg->prec = arg->prec*10 + (arg->ch - '0');
14395 }
14396 }
14397 }
14398
14399 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14400 if (ctx->fmtcnt >= 0) {
14401 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14402 if (--ctx->fmtcnt >= 0) {
14403 arg->ch = FORMAT_READ(ctx);
14404 ctx->fmtpos++;
14405 }
14406 }
14407 }
14408 if (ctx->fmtcnt < 0) {
14409 PyErr_SetString(PyExc_ValueError,
14410 "incomplete format");
14411 return -1;
14412 }
14413 return 0;
14414
14415#undef FORMAT_READ
14416}
14417
14418/* Format one argument. Supported conversion specifiers:
14419
14420 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014421 - "i", "d", "u": int or float
14422 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014423 - "e", "E", "f", "F", "g", "G": float
14424 - "c": int or str (1 character)
14425
Victor Stinner8dbd4212012-12-04 09:30:24 +010014426 When possible, the output is written directly into the Unicode writer
14427 (ctx->writer). A string is created when padding is required.
14428
Victor Stinnera47082312012-10-04 02:19:54 +020014429 Return 0 if the argument has been formatted into *p_str,
14430 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014431 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014432static int
14433unicode_format_arg_format(struct unicode_formatter_t *ctx,
14434 struct unicode_format_arg_t *arg,
14435 PyObject **p_str)
14436{
14437 PyObject *v;
14438 _PyUnicodeWriter *writer = &ctx->writer;
14439
14440 if (ctx->fmtcnt == 0)
14441 ctx->writer.overallocate = 0;
14442
14443 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014444 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014445 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014446 return 1;
14447 }
14448
14449 v = unicode_format_getnextarg(ctx);
14450 if (v == NULL)
14451 return -1;
14452
Victor Stinnera47082312012-10-04 02:19:54 +020014453
14454 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014455 case 's':
14456 case 'r':
14457 case 'a':
14458 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14459 /* Fast path */
14460 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14461 return -1;
14462 return 1;
14463 }
14464
14465 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14466 *p_str = v;
14467 Py_INCREF(*p_str);
14468 }
14469 else {
14470 if (arg->ch == 's')
14471 *p_str = PyObject_Str(v);
14472 else if (arg->ch == 'r')
14473 *p_str = PyObject_Repr(v);
14474 else
14475 *p_str = PyObject_ASCII(v);
14476 }
14477 break;
14478
14479 case 'i':
14480 case 'd':
14481 case 'u':
14482 case 'o':
14483 case 'x':
14484 case 'X':
14485 {
14486 int ret = mainformatlong(v, arg, p_str, writer);
14487 if (ret != 0)
14488 return ret;
14489 arg->sign = 1;
14490 break;
14491 }
14492
14493 case 'e':
14494 case 'E':
14495 case 'f':
14496 case 'F':
14497 case 'g':
14498 case 'G':
14499 if (arg->width == -1 && arg->prec == -1
14500 && !(arg->flags & (F_SIGN | F_BLANK)))
14501 {
14502 /* Fast path */
14503 if (formatfloat(v, arg, NULL, writer) == -1)
14504 return -1;
14505 return 1;
14506 }
14507
14508 arg->sign = 1;
14509 if (formatfloat(v, arg, p_str, NULL) == -1)
14510 return -1;
14511 break;
14512
14513 case 'c':
14514 {
14515 Py_UCS4 ch = formatchar(v);
14516 if (ch == (Py_UCS4) -1)
14517 return -1;
14518 if (arg->width == -1 && arg->prec == -1) {
14519 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014520 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014521 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014522 return 1;
14523 }
14524 *p_str = PyUnicode_FromOrdinal(ch);
14525 break;
14526 }
14527
14528 default:
14529 PyErr_Format(PyExc_ValueError,
14530 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014531 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014532 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14533 (int)arg->ch,
14534 ctx->fmtpos - 1);
14535 return -1;
14536 }
14537 if (*p_str == NULL)
14538 return -1;
14539 assert (PyUnicode_Check(*p_str));
14540 return 0;
14541}
14542
14543static int
14544unicode_format_arg_output(struct unicode_formatter_t *ctx,
14545 struct unicode_format_arg_t *arg,
14546 PyObject *str)
14547{
14548 Py_ssize_t len;
14549 enum PyUnicode_Kind kind;
14550 void *pbuf;
14551 Py_ssize_t pindex;
14552 Py_UCS4 signchar;
14553 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014554 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014555 Py_ssize_t sublen;
14556 _PyUnicodeWriter *writer = &ctx->writer;
14557 Py_UCS4 fill;
14558
14559 fill = ' ';
14560 if (arg->sign && arg->flags & F_ZERO)
14561 fill = '0';
14562
14563 if (PyUnicode_READY(str) == -1)
14564 return -1;
14565
14566 len = PyUnicode_GET_LENGTH(str);
14567 if ((arg->width == -1 || arg->width <= len)
14568 && (arg->prec == -1 || arg->prec >= len)
14569 && !(arg->flags & (F_SIGN | F_BLANK)))
14570 {
14571 /* Fast path */
14572 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14573 return -1;
14574 return 0;
14575 }
14576
14577 /* Truncate the string for "s", "r" and "a" formats
14578 if the precision is set */
14579 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14580 if (arg->prec >= 0 && len > arg->prec)
14581 len = arg->prec;
14582 }
14583
14584 /* Adjust sign and width */
14585 kind = PyUnicode_KIND(str);
14586 pbuf = PyUnicode_DATA(str);
14587 pindex = 0;
14588 signchar = '\0';
14589 if (arg->sign) {
14590 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14591 if (ch == '-' || ch == '+') {
14592 signchar = ch;
14593 len--;
14594 pindex++;
14595 }
14596 else if (arg->flags & F_SIGN)
14597 signchar = '+';
14598 else if (arg->flags & F_BLANK)
14599 signchar = ' ';
14600 else
14601 arg->sign = 0;
14602 }
14603 if (arg->width < len)
14604 arg->width = len;
14605
14606 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014607 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014608 if (!(arg->flags & F_LJUST)) {
14609 if (arg->sign) {
14610 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014611 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014612 }
14613 else {
14614 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014615 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014616 }
14617 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014618 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14619 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014620 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014621 }
14622
Victor Stinnera47082312012-10-04 02:19:54 +020014623 buflen = arg->width;
14624 if (arg->sign && len == arg->width)
14625 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014626 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014627 return -1;
14628
14629 /* Write the sign if needed */
14630 if (arg->sign) {
14631 if (fill != ' ') {
14632 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14633 writer->pos += 1;
14634 }
14635 if (arg->width > len)
14636 arg->width--;
14637 }
14638
14639 /* Write the numeric prefix for "x", "X" and "o" formats
14640 if the alternate form is used.
14641 For example, write "0x" for the "%#x" format. */
14642 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14643 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14644 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14645 if (fill != ' ') {
14646 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14647 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14648 writer->pos += 2;
14649 pindex += 2;
14650 }
14651 arg->width -= 2;
14652 if (arg->width < 0)
14653 arg->width = 0;
14654 len -= 2;
14655 }
14656
14657 /* Pad left with the fill character if needed */
14658 if (arg->width > len && !(arg->flags & F_LJUST)) {
14659 sublen = arg->width - len;
14660 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14661 writer->pos += sublen;
14662 arg->width = len;
14663 }
14664
14665 /* If padding with spaces: write sign if needed and/or numeric prefix if
14666 the alternate form is used */
14667 if (fill == ' ') {
14668 if (arg->sign) {
14669 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14670 writer->pos += 1;
14671 }
14672 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14673 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14674 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14675 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14676 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14677 writer->pos += 2;
14678 pindex += 2;
14679 }
14680 }
14681
14682 /* Write characters */
14683 if (len) {
14684 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14685 str, pindex, len);
14686 writer->pos += len;
14687 }
14688
14689 /* Pad right with the fill character if needed */
14690 if (arg->width > len) {
14691 sublen = arg->width - len;
14692 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14693 writer->pos += sublen;
14694 }
14695 return 0;
14696}
14697
14698/* Helper of PyUnicode_Format(): format one arg.
14699 Return 0 on success, raise an exception and return -1 on error. */
14700static int
14701unicode_format_arg(struct unicode_formatter_t *ctx)
14702{
14703 struct unicode_format_arg_t arg;
14704 PyObject *str;
14705 int ret;
14706
Victor Stinner8dbd4212012-12-04 09:30:24 +010014707 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14708 arg.flags = 0;
14709 arg.width = -1;
14710 arg.prec = -1;
14711 arg.sign = 0;
14712 str = NULL;
14713
Victor Stinnera47082312012-10-04 02:19:54 +020014714 ret = unicode_format_arg_parse(ctx, &arg);
14715 if (ret == -1)
14716 return -1;
14717
14718 ret = unicode_format_arg_format(ctx, &arg, &str);
14719 if (ret == -1)
14720 return -1;
14721
14722 if (ret != 1) {
14723 ret = unicode_format_arg_output(ctx, &arg, str);
14724 Py_DECREF(str);
14725 if (ret == -1)
14726 return -1;
14727 }
14728
14729 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14730 PyErr_SetString(PyExc_TypeError,
14731 "not all arguments converted during string formatting");
14732 return -1;
14733 }
14734 return 0;
14735}
14736
Alexander Belopolsky40018472011-02-26 01:02:56 +000014737PyObject *
14738PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014739{
Victor Stinnera47082312012-10-04 02:19:54 +020014740 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014741
Guido van Rossumd57fd912000-03-10 22:53:23 +000014742 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014743 PyErr_BadInternalCall();
14744 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014745 }
Victor Stinnera47082312012-10-04 02:19:54 +020014746
14747 ctx.fmtstr = PyUnicode_FromObject(format);
14748 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014749 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014750 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14751 Py_DECREF(ctx.fmtstr);
14752 return NULL;
14753 }
14754 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14755 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14756 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14757 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014758
Victor Stinner8f674cc2013-04-17 23:02:17 +020014759 _PyUnicodeWriter_Init(&ctx.writer);
14760 ctx.writer.min_length = ctx.fmtcnt + 100;
14761 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014762
Guido van Rossumd57fd912000-03-10 22:53:23 +000014763 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014764 ctx.arglen = PyTuple_Size(args);
14765 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014766 }
14767 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014768 ctx.arglen = -1;
14769 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014770 }
Victor Stinnera47082312012-10-04 02:19:54 +020014771 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014772 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014773 ctx.dict = args;
14774 else
14775 ctx.dict = NULL;
14776 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014777
Victor Stinnera47082312012-10-04 02:19:54 +020014778 while (--ctx.fmtcnt >= 0) {
14779 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014780 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014781
14782 nonfmtpos = ctx.fmtpos++;
14783 while (ctx.fmtcnt >= 0 &&
14784 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14785 ctx.fmtpos++;
14786 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014787 }
Victor Stinnera47082312012-10-04 02:19:54 +020014788 if (ctx.fmtcnt < 0) {
14789 ctx.fmtpos--;
14790 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014791 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014792
Victor Stinnercfc4c132013-04-03 01:48:39 +020014793 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14794 nonfmtpos, ctx.fmtpos) < 0)
14795 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014796 }
14797 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014798 ctx.fmtpos++;
14799 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014800 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014801 }
14802 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014803
Victor Stinnera47082312012-10-04 02:19:54 +020014804 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014805 PyErr_SetString(PyExc_TypeError,
14806 "not all arguments converted during string formatting");
14807 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014808 }
14809
Victor Stinnera47082312012-10-04 02:19:54 +020014810 if (ctx.args_owned) {
14811 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014812 }
Victor Stinnera47082312012-10-04 02:19:54 +020014813 Py_DECREF(ctx.fmtstr);
14814 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014815
Benjamin Peterson29060642009-01-31 22:14:21 +000014816 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014817 Py_DECREF(ctx.fmtstr);
14818 _PyUnicodeWriter_Dealloc(&ctx.writer);
14819 if (ctx.args_owned) {
14820 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014821 }
14822 return NULL;
14823}
14824
Jeremy Hylton938ace62002-07-17 16:30:39 +000014825static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014826unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14827
Tim Peters6d6c1a32001-08-02 04:15:00 +000014828static PyObject *
14829unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14830{
Benjamin Peterson29060642009-01-31 22:14:21 +000014831 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014832 static char *kwlist[] = {"object", "encoding", "errors", 0};
14833 char *encoding = NULL;
14834 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014835
Benjamin Peterson14339b62009-01-31 16:36:08 +000014836 if (type != &PyUnicode_Type)
14837 return unicode_subtype_new(type, args, kwds);
14838 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014839 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014840 return NULL;
14841 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014842 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014843 if (encoding == NULL && errors == NULL)
14844 return PyObject_Str(x);
14845 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014846 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014847}
14848
Guido van Rossume023fe02001-08-30 03:12:59 +000014849static PyObject *
14850unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14851{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014852 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014853 Py_ssize_t length, char_size;
14854 int share_wstr, share_utf8;
14855 unsigned int kind;
14856 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014857
Benjamin Peterson14339b62009-01-31 16:36:08 +000014858 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014859
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014860 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014861 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014862 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014863 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014864 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014865 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014866 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014867 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014868
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014869 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014870 if (self == NULL) {
14871 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014872 return NULL;
14873 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014874 kind = PyUnicode_KIND(unicode);
14875 length = PyUnicode_GET_LENGTH(unicode);
14876
14877 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014878#ifdef Py_DEBUG
14879 _PyUnicode_HASH(self) = -1;
14880#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014881 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014882#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014883 _PyUnicode_STATE(self).interned = 0;
14884 _PyUnicode_STATE(self).kind = kind;
14885 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014886 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014887 _PyUnicode_STATE(self).ready = 1;
14888 _PyUnicode_WSTR(self) = NULL;
14889 _PyUnicode_UTF8_LENGTH(self) = 0;
14890 _PyUnicode_UTF8(self) = NULL;
14891 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014892 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014893
14894 share_utf8 = 0;
14895 share_wstr = 0;
14896 if (kind == PyUnicode_1BYTE_KIND) {
14897 char_size = 1;
14898 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14899 share_utf8 = 1;
14900 }
14901 else if (kind == PyUnicode_2BYTE_KIND) {
14902 char_size = 2;
14903 if (sizeof(wchar_t) == 2)
14904 share_wstr = 1;
14905 }
14906 else {
14907 assert(kind == PyUnicode_4BYTE_KIND);
14908 char_size = 4;
14909 if (sizeof(wchar_t) == 4)
14910 share_wstr = 1;
14911 }
14912
14913 /* Ensure we won't overflow the length. */
14914 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14915 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014916 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014917 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014918 data = PyObject_MALLOC((length + 1) * char_size);
14919 if (data == NULL) {
14920 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014921 goto onError;
14922 }
14923
Victor Stinnerc3c74152011-10-02 20:39:55 +020014924 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014925 if (share_utf8) {
14926 _PyUnicode_UTF8_LENGTH(self) = length;
14927 _PyUnicode_UTF8(self) = data;
14928 }
14929 if (share_wstr) {
14930 _PyUnicode_WSTR_LENGTH(self) = length;
14931 _PyUnicode_WSTR(self) = (wchar_t *)data;
14932 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014933
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014934 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014935 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014936 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014937#ifdef Py_DEBUG
14938 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14939#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014940 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014941 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014942
14943onError:
14944 Py_DECREF(unicode);
14945 Py_DECREF(self);
14946 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014947}
14948
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014949PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014950"str(object='') -> str\n\
14951str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014952\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014953Create a new string object from the given object. If encoding or\n\
14954errors is specified, then the object must expose a data buffer\n\
14955that will be decoded using the given encoding and error handler.\n\
14956Otherwise, returns the result of object.__str__() (if defined)\n\
14957or repr(object).\n\
14958encoding defaults to sys.getdefaultencoding().\n\
14959errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014960
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014961static PyObject *unicode_iter(PyObject *seq);
14962
Guido van Rossumd57fd912000-03-10 22:53:23 +000014963PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014964 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014965 "str", /* tp_name */
14966 sizeof(PyUnicodeObject), /* tp_size */
14967 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014968 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014969 (destructor)unicode_dealloc, /* tp_dealloc */
14970 0, /* tp_print */
14971 0, /* tp_getattr */
14972 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014973 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014974 unicode_repr, /* tp_repr */
14975 &unicode_as_number, /* tp_as_number */
14976 &unicode_as_sequence, /* tp_as_sequence */
14977 &unicode_as_mapping, /* tp_as_mapping */
14978 (hashfunc) unicode_hash, /* tp_hash*/
14979 0, /* tp_call*/
14980 (reprfunc) unicode_str, /* tp_str */
14981 PyObject_GenericGetAttr, /* tp_getattro */
14982 0, /* tp_setattro */
14983 0, /* tp_as_buffer */
14984 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014985 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014986 unicode_doc, /* tp_doc */
14987 0, /* tp_traverse */
14988 0, /* tp_clear */
14989 PyUnicode_RichCompare, /* tp_richcompare */
14990 0, /* tp_weaklistoffset */
14991 unicode_iter, /* tp_iter */
14992 0, /* tp_iternext */
14993 unicode_methods, /* tp_methods */
14994 0, /* tp_members */
14995 0, /* tp_getset */
14996 &PyBaseObject_Type, /* tp_base */
14997 0, /* tp_dict */
14998 0, /* tp_descr_get */
14999 0, /* tp_descr_set */
15000 0, /* tp_dictoffset */
15001 0, /* tp_init */
15002 0, /* tp_alloc */
15003 unicode_new, /* tp_new */
15004 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015005};
15006
15007/* Initialize the Unicode implementation */
15008
Victor Stinner3a50e702011-10-18 21:21:00 +020015009int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015010{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015011 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015012 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015013 0x000A, /* LINE FEED */
15014 0x000D, /* CARRIAGE RETURN */
15015 0x001C, /* FILE SEPARATOR */
15016 0x001D, /* GROUP SEPARATOR */
15017 0x001E, /* RECORD SEPARATOR */
15018 0x0085, /* NEXT LINE */
15019 0x2028, /* LINE SEPARATOR */
15020 0x2029, /* PARAGRAPH SEPARATOR */
15021 };
15022
Fred Drakee4315f52000-05-09 19:53:39 +000015023 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015024 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015025 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015026 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020015027 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015028
Guido van Rossumcacfc072002-05-24 19:01:59 +000015029 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015030 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000015031
15032 /* initialize the linebreak bloom filter */
15033 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015034 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015035 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015036
Christian Heimes26532f72013-07-20 14:57:16 +020015037 if (PyType_Ready(&EncodingMapType) < 0)
15038 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020015039
Benjamin Petersonc4311282012-10-30 23:21:10 -040015040 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15041 Py_FatalError("Can't initialize field name iterator type");
15042
15043 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15044 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040015045
Victor Stinner3a50e702011-10-18 21:21:00 +020015046 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015047}
15048
15049/* Finalize the Unicode implementation */
15050
int
PyUnicode_ClearFreeList(void)
{
    /* Nothing is released here; the reported count is always zero. */
    const int freed = 0;

    return freed;
}
15056
Guido van Rossumd57fd912000-03-10 22:53:23 +000015057void
Thomas Wouters78890102000-07-22 19:25:51 +000015058_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015059{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015060 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015061
Serhiy Storchaka05997252013-01-26 12:14:02 +020015062 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015063
Serhiy Storchaka05997252013-01-26 12:14:02 +020015064 for (i = 0; i < 256; i++)
15065 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015066 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015067 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015068}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015069
Walter Dörwald16807132007-05-25 13:52:07 +000015070void
15071PyUnicode_InternInPlace(PyObject **p)
15072{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015073 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015074 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015075#ifdef Py_DEBUG
15076 assert(s != NULL);
15077 assert(_PyUnicode_CHECK(s));
15078#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015079 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015080 return;
15081#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015082 /* If it's a subclass, we don't really know what putting
15083 it in the interned dict might do. */
15084 if (!PyUnicode_CheckExact(s))
15085 return;
15086 if (PyUnicode_CHECK_INTERNED(s))
15087 return;
15088 if (interned == NULL) {
15089 interned = PyDict_New();
15090 if (interned == NULL) {
15091 PyErr_Clear(); /* Don't leave an exception */
15092 return;
15093 }
15094 }
15095 /* It might be that the GetItem call fails even
15096 though the key is present in the dictionary,
15097 namely when this happens during a stack overflow. */
15098 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010015099 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015100 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000015101
Victor Stinnerf0335102013-04-14 19:13:03 +020015102 if (t) {
15103 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015104 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015105 return;
15106 }
Walter Dörwald16807132007-05-25 13:52:07 +000015107
Benjamin Peterson14339b62009-01-31 16:36:08 +000015108 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015109 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015110 PyErr_Clear();
15111 PyThreadState_GET()->recursion_critical = 0;
15112 return;
15113 }
15114 PyThreadState_GET()->recursion_critical = 0;
15115 /* The two references in interned are not counted by refcnt.
15116 The deallocator will take care of this */
15117 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015118 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015119}
15120
15121void
15122PyUnicode_InternImmortal(PyObject **p)
15123{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015124 PyUnicode_InternInPlace(p);
15125 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015126 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015127 Py_INCREF(*p);
15128 }
Walter Dörwald16807132007-05-25 13:52:07 +000015129}
15130
15131PyObject *
15132PyUnicode_InternFromString(const char *cp)
15133{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015134 PyObject *s = PyUnicode_FromString(cp);
15135 if (s == NULL)
15136 return NULL;
15137 PyUnicode_InternInPlace(&s);
15138 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015139}
15140
Alexander Belopolsky40018472011-02-26 01:02:56 +000015141void
15142_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015143{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015144 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015145 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015146 Py_ssize_t i, n;
15147 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015148
Benjamin Peterson14339b62009-01-31 16:36:08 +000015149 if (interned == NULL || !PyDict_Check(interned))
15150 return;
15151 keys = PyDict_Keys(interned);
15152 if (keys == NULL || !PyList_Check(keys)) {
15153 PyErr_Clear();
15154 return;
15155 }
Walter Dörwald16807132007-05-25 13:52:07 +000015156
Benjamin Peterson14339b62009-01-31 16:36:08 +000015157 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15158 detector, interned unicode strings are not forcibly deallocated;
15159 rather, we give them their stolen references back, and then clear
15160 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015161
Benjamin Peterson14339b62009-01-31 16:36:08 +000015162 n = PyList_GET_SIZE(keys);
15163 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015164 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015165 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015166 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015167 if (PyUnicode_READY(s) == -1) {
15168 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015169 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015170 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015171 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015172 case SSTATE_NOT_INTERNED:
15173 /* XXX Shouldn't happen */
15174 break;
15175 case SSTATE_INTERNED_IMMORTAL:
15176 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015177 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015178 break;
15179 case SSTATE_INTERNED_MORTAL:
15180 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015181 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015182 break;
15183 default:
15184 Py_FatalError("Inconsistent interned string state.");
15185 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015186 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015187 }
15188 fprintf(stderr, "total size of all interned strings: "
15189 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15190 "mortal/immortal\n", mortal_size, immortal_size);
15191 Py_DECREF(keys);
15192 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015193 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015194}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015195
15196
15197/********************* Unicode Iterator **************************/
15198
15199typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015200 PyObject_HEAD
15201 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015202 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015203} unicodeiterobject;
15204
15205static void
15206unicodeiter_dealloc(unicodeiterobject *it)
15207{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015208 _PyObject_GC_UNTRACK(it);
15209 Py_XDECREF(it->it_seq);
15210 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015211}
15212
15213static int
15214unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15215{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015216 Py_VISIT(it->it_seq);
15217 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015218}
15219
15220static PyObject *
15221unicodeiter_next(unicodeiterobject *it)
15222{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015223 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015224
Benjamin Peterson14339b62009-01-31 16:36:08 +000015225 assert(it != NULL);
15226 seq = it->it_seq;
15227 if (seq == NULL)
15228 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015229 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015231 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15232 int kind = PyUnicode_KIND(seq);
15233 void *data = PyUnicode_DATA(seq);
15234 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15235 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015236 if (item != NULL)
15237 ++it->it_index;
15238 return item;
15239 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015240
Benjamin Peterson14339b62009-01-31 16:36:08 +000015241 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015242 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015243 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015244}
15245
15246static PyObject *
15247unicodeiter_len(unicodeiterobject *it)
15248{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015249 Py_ssize_t len = 0;
15250 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015251 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015252 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015253}
15254
15255PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15256
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015257static PyObject *
15258unicodeiter_reduce(unicodeiterobject *it)
15259{
15260 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015261 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015262 it->it_seq, it->it_index);
15263 } else {
15264 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15265 if (u == NULL)
15266 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015267 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015268 }
15269}
15270
15271PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15272
15273static PyObject *
15274unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15275{
15276 Py_ssize_t index = PyLong_AsSsize_t(state);
15277 if (index == -1 && PyErr_Occurred())
15278 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015279 if (it->it_seq != NULL) {
15280 if (index < 0)
15281 index = 0;
15282 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15283 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15284 it->it_index = index;
15285 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015286 Py_RETURN_NONE;
15287}
15288
15289PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15290
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015291static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015292 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015293 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015294 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15295 reduce_doc},
15296 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15297 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015298 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015299};
15300
15301PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015302 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15303 "str_iterator", /* tp_name */
15304 sizeof(unicodeiterobject), /* tp_basicsize */
15305 0, /* tp_itemsize */
15306 /* methods */
15307 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15308 0, /* tp_print */
15309 0, /* tp_getattr */
15310 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015311 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015312 0, /* tp_repr */
15313 0, /* tp_as_number */
15314 0, /* tp_as_sequence */
15315 0, /* tp_as_mapping */
15316 0, /* tp_hash */
15317 0, /* tp_call */
15318 0, /* tp_str */
15319 PyObject_GenericGetAttr, /* tp_getattro */
15320 0, /* tp_setattro */
15321 0, /* tp_as_buffer */
15322 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15323 0, /* tp_doc */
15324 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15325 0, /* tp_clear */
15326 0, /* tp_richcompare */
15327 0, /* tp_weaklistoffset */
15328 PyObject_SelfIter, /* tp_iter */
15329 (iternextfunc)unicodeiter_next, /* tp_iternext */
15330 unicodeiter_methods, /* tp_methods */
15331 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015332};
15333
15334static PyObject *
15335unicode_iter(PyObject *seq)
15336{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015337 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015338
Benjamin Peterson14339b62009-01-31 16:36:08 +000015339 if (!PyUnicode_Check(seq)) {
15340 PyErr_BadInternalCall();
15341 return NULL;
15342 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015343 if (PyUnicode_READY(seq) == -1)
15344 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015345 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15346 if (it == NULL)
15347 return NULL;
15348 it->it_index = 0;
15349 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015350 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015351 _PyObject_GC_TRACK(it);
15352 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015353}
15354
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015355
/* Return the number of Py_UNICODE units in `u` that precede the
   terminating null unit.

   The count is kept in a size_t, matching the return type, so that very
   long buffers cannot overflow a signed int counter. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t res = 0;
    while (u[res] != 0)
        res++;
    return res;
}
15364
/* Copy the null-terminated Py_UNICODE string `s2`, including its
   terminating null unit, into `s1`.  Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;
    Py_UNICODE ch;

    do {
        ch = *s2++;
        *dst++ = ch;
    } while (ch != 0);
    return s1;
}
15372
/* Copy at most `n` Py_UNICODE units from `s2` into `s1`.  Returns `s1`.

   Copying stops after the terminating null unit of `s2` has been copied,
   or after `n` units have been written, whichever comes first.  Never more
   than `n` units are stored, so a destination of `n` units is sufficient
   (with n == 0 nothing is written at all).  The remainder of `s1` is not
   padded, and `s1` is not null-terminated when `s2` holds `n` or more
   non-null units. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    /* The bound is checked before each store, not after it. */
    for (i = 0; i < n; i++) {
        s1[i] = s2[i];
        if (s2[i] == 0)
            break;
    }
    return s1;
}
15382
/* Append the null-terminated Py_UNICODE string `s2` to the end of `s1`:
   `s2` is copied with Py_UNICODE_strcpy() to the position
   Py_UNICODE_strlen(s1) units into `s1`.  Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *tail = s1 + Py_UNICODE_strlen(s1);

    Py_UNICODE_strcpy(tail, s2);
    return s1;
}
15391
/* Compare two null-terminated Py_UNICODE strings unit by unit.

   Returns 0 if they are equal, -1 if `s1` orders before `s2` and +1 if it
   orders after.  When one string is a proper prefix of the other, the
   shorter one orders first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    for (;; s1++, s2++) {
        Py_UNICODE a = *s1;
        Py_UNICODE b = *s2;

        if (a == 0 || b == 0) {
            /* At least one string ended here. */
            if (a)
                return 1;
            if (b)
                return -1;
            return 0;
        }
        if (a != b)
            return (a < b) ? -1 : +1;
    }
}
15405
/* Compare at most `n` units of two null-terminated Py_UNICODE strings.

   Returns -1 or +1 at the first differing unit (according to which unit is
   smaller), and 0 if no difference is found before `n` units have been
   compared or before both strings end. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        Py_UNICODE a = s1[i];
        Py_UNICODE b = s2[i];

        if (a != b)
            return (a < b) ? -1 : +1;
        if (a == '\0')
            break;      /* both strings ended together */
    }
    return 0;
}
15422
/* Return a pointer to the first occurrence of `c` in the null-terminated
   Py_UNICODE string `s`, or NULL if there is none.

   The terminating null unit is not part of the search, so looking for
   c == 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
15432
/* Return a pointer to the last occurrence of `c` in the null-terminated
   Py_UNICODE string `s`, or NULL if there is none.

   The search runs backwards from the unit just before the terminator, so
   the terminating null unit itself is never matched. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    size_t i = Py_UNICODE_strlen(s);

    while (i > 0) {
        i--;
        if (s[i] == c)
            return (Py_UNICODE*)(s + i);
    }
    return NULL;
}
Victor Stinner331ea922010-08-10 16:37:20 +000015445
Victor Stinner71133ff2010-09-01 23:43:53 +000015446Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015447PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015448{
Victor Stinner577db2c2011-10-11 22:12:48 +020015449 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015450 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015452 if (!PyUnicode_Check(unicode)) {
15453 PyErr_BadArgument();
15454 return NULL;
15455 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015456 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015457 if (u == NULL)
15458 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015459 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015460 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015461 PyErr_NoMemory();
15462 return NULL;
15463 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015464 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015465 size *= sizeof(Py_UNICODE);
15466 copy = PyMem_Malloc(size);
15467 if (copy == NULL) {
15468 PyErr_NoMemory();
15469 return NULL;
15470 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015471 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015472 return copy;
15473}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015474
Georg Brandl66c221e2010-10-14 07:04:07 +000015475/* A _string module, to export formatter_parser and formatter_field_name_split
15476 to the string.Formatter class implemented in Python. */
15477
15478static PyMethodDef _string_methods[] = {
15479 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15480 METH_O, PyDoc_STR("split the argument as a field name")},
15481 {"formatter_parser", (PyCFunction) formatter_parser,
15482 METH_O, PyDoc_STR("parse the argument as a format string")},
15483 {NULL, NULL}
15484};
15485
15486static struct PyModuleDef _string_module = {
15487 PyModuleDef_HEAD_INIT,
15488 "_string",
15489 PyDoc_STR("string helper module"),
15490 0,
15491 _string_methods,
15492 NULL,
15493 NULL,
15494 NULL,
15495 NULL
15496};
15497
15498PyMODINIT_FUNC
15499PyInit__string(void)
15500{
15501 return PyModule_Create(&_string_module);
15502}
15503
15504
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015505#ifdef __cplusplus
15506}
15507#endif