blob: 19549cde5722b87b84c525e1dca0c8e20fea6437 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Larry Hastings61272b72014-01-07 12:41:53 -080050/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080051class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080052[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080053/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080054
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055/* --- Globals ------------------------------------------------------------
56
Serhiy Storchaka05997252013-01-26 12:14:02 +020057NOTE: In the interpreter's initialization phase, some globals are currently
58 initialized dynamically as needed. In the process Unicode objects may
59 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000060
61*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063
64#ifdef __cplusplus
65extern "C" {
66#endif
67
Victor Stinner8faf8212011-12-08 22:14:11 +010068/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
69#define MAX_UNICODE 0x10ffff
70
Victor Stinner910337b2011-10-03 03:20:16 +020071#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020072# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020073#else
74# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
75#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020076
Victor Stinnere90fe6a2011-10-01 16:48:13 +020077#define _PyUnicode_UTF8(op) \
78 (((PyCompactUnicodeObject*)(op))->utf8)
79#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020080 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020081 assert(PyUnicode_IS_READY(op)), \
82 PyUnicode_IS_COMPACT_ASCII(op) ? \
83 ((char*)((PyASCIIObject*)(op) + 1)) : \
84 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +020085#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020086 (((PyCompactUnicodeObject*)(op))->utf8_length)
87#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020088 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020089 assert(PyUnicode_IS_READY(op)), \
90 PyUnicode_IS_COMPACT_ASCII(op) ? \
91 ((PyASCIIObject*)(op))->length : \
92 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +020093#define _PyUnicode_WSTR(op) \
94 (((PyASCIIObject*)(op))->wstr)
95#define _PyUnicode_WSTR_LENGTH(op) \
96 (((PyCompactUnicodeObject*)(op))->wstr_length)
97#define _PyUnicode_LENGTH(op) \
98 (((PyASCIIObject *)(op))->length)
99#define _PyUnicode_STATE(op) \
100 (((PyASCIIObject *)(op))->state)
101#define _PyUnicode_HASH(op) \
102 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200103#define _PyUnicode_KIND(op) \
104 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200105 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200106#define _PyUnicode_GET_LENGTH(op) \
107 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200108 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200109#define _PyUnicode_DATA_ANY(op) \
110 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200111
Victor Stinner910337b2011-10-03 03:20:16 +0200112#undef PyUnicode_READY
113#define PyUnicode_READY(op) \
114 (assert(_PyUnicode_CHECK(op)), \
115 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200116 0 : \
Victor Stinner7931d9a2011-11-04 00:22:48 +0100117 _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
Victor Stinnerc379ead2011-10-03 12:52:27 +0200119#define _PyUnicode_SHARE_UTF8(op) \
120 (assert(_PyUnicode_CHECK(op)), \
121 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
122 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the very same pointer as its
   character data, i.e. both representations share one buffer.  Every
   access goes through the macro argument, so the macro works regardless
   of what the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
Victor Stinner829c0ad2011-10-03 01:08:02 +0200127/* true if the Unicode object has an allocated UTF-8 memory block
128 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200129#define _PyUnicode_HAS_UTF8_MEMORY(op) \
Victor Stinnere699e5a2013-07-15 18:22:47 +0200130 ((!PyUnicode_IS_COMPACT_ASCII(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200131 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200132 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
133
Victor Stinner03490912011-10-03 23:45:12 +0200134/* true if the Unicode object has an allocated wstr memory block
135 (not shared with other data) */
136#define _PyUnicode_HAS_WSTR_MEMORY(op) \
Victor Stinnere699e5a2013-07-15 18:22:47 +0200137 ((_PyUnicode_WSTR(op) && \
Victor Stinner03490912011-10-03 23:45:12 +0200138 (!PyUnicode_IS_READY(op) || \
139 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
140
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The source range is only read, so it is accessed through
   const-qualified pointers and the casts keep that qualifier.  Most of
   the range is copied four elements per iteration; the remaining
   (end - begin) % 4 elements are copied one at a time.  All locals carry
   a leading underscore so they do not shadow a caller's variable. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                        \
        to_type *_to = (to_type *)(to);                         \
        const from_type *_iter = (const from_type *)(begin);    \
        const from_type *_end = (const from_type *)(end);       \
        Py_ssize_t _n = (_end) - (_iter);                       \
        const from_type *_unrolled_end =                        \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);                 \
        while (_iter < (_unrolled_end)) {                       \
            _to[0] = (to_type) _iter[0];                        \
            _to[1] = (to_type) _iter[1];                        \
            _to[2] = (to_type) _iter[2];                        \
            _to[3] = (to_type) _iter[3];                        \
            _iter += 4; _to += 4;                               \
        }                                                       \
        while (_iter < (_end))                                  \
            *_to++ = (to_type) *_iter++;                        \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200173static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000174
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200176static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200177
/* Take a new reference to the shared empty string, creating the singleton
   with PyUnicode_New(0, 0) the first time it is needed.  If that creation
   fails, unicode_empty is left NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                  \
    do {                                                            \
        if (unicode_empty == NULL) {                                \
            unicode_empty = PyUnicode_New(0, 0);                    \
            if (unicode_empty != NULL) {                            \
                Py_INCREF(unicode_empty);                           \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                                       \
        }                                                           \
        else {                                                      \
            Py_INCREF(unicode_empty);                               \
        }                                                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190
Serhiy Storchaka678db842013-01-26 12:16:36 +0200191#define _Py_RETURN_UNICODE_EMPTY() \
192 do { \
193 _Py_INCREF_UNICODE_EMPTY(); \
194 return unicode_empty; \
195 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200197/* Forward declaration */
198Py_LOCAL_INLINE(int)
199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
200
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters: entry c is 1
   when the 7-bit character c is one of the whitespace characters listed
   below, and 0 otherwise.  One row per eight code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00..0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08..0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10..0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18..0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20..0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28..0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30..0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x38..0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40..0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x48..0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50..0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x58..0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60..0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x68..0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70..0x77 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x78..0x7F */
    0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100242static int unicode_modifiable(PyObject *unicode);
243
Victor Stinnerfe226c02011-10-03 03:52:20 +0200244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200247static PyObject *
248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
249static PyObject *
250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
251
252static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100255 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
257
Alexander Belopolsky40018472011-02-26 01:02:56 +0000258static void
259raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300260 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100261 PyObject *unicode,
262 Py_ssize_t startpos, Py_ssize_t endpos,
263 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000264
/* Fast detection of line break characters in the 7-bit range: entry c is 1
   when character c is one of the line breaks listed below, and 0 otherwise.
   The table is a read-only lookup table, hence const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
292
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300293#include "clinic/unicodeobject.c.h"
294
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300295/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
296 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000298PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000299{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000300#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000302#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000303 /* This is actually an illegal character, so it should
304 not be passed to unichr. */
305 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000306#endif
307}
308
#ifdef Py_DEBUG
/* Verify the internal invariants of a Unicode object (debug builds only).

   op must be a Unicode object.  The checks cover the state flags (kind,
   ascii, compact, ready, interned) and how the utf8 and wstr pointers
   relate to the character data.  When check_content is non-zero and the
   string is not of PyUnicode_WCHAR_KIND, the characters are scanned as
   well to make sure the narrowest possible kind is in use and that the
   data is followed by a null character.

   A violated invariant trips an assert(); otherwise 1 is returned, so the
   call can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    unsigned int kind;

    assert(PyUnicode_Check(op));
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact non-ASCII string: data follows the header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            /* non-compact string: data lives behind a pointer */
            data = ((PyUnicodeObject *)op)->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
            else {
                /* not-ready string: only the wstr buffer exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_UCS4 ch;
        Py_ssize_t i;

        for (i = 0; i < ascii->length; i++) {
            ch = PyUnicode_READ(kind, data, i);
            if (maxchar < ch)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii != 0) {
                assert(maxchar < 128);
            }
            else {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character data must be null-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
424
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100425static PyObject*
426unicode_result_wchar(PyObject *unicode)
427{
428#ifndef Py_DEBUG
429 Py_ssize_t len;
430
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 len = _PyUnicode_WSTR_LENGTH(unicode);
432 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200434 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100435 }
436
437 if (len == 1) {
438 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100439 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100440 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
441 Py_DECREF(unicode);
442 return latin1_char;
443 }
444 }
445
446 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200447 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100448 return NULL;
449 }
450#else
Victor Stinneraa771272012-10-04 02:32:58 +0200451 assert(Py_REFCNT(unicode) == 1);
452
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100453 /* don't make the result ready in debug mode to ensure that the caller
454 makes the string ready before using it */
455 assert(_PyUnicode_CheckConsistency(unicode, 1));
456#endif
457 return unicode;
458}
459
460static PyObject*
461unicode_result_ready(PyObject *unicode)
462{
463 Py_ssize_t length;
464
465 length = PyUnicode_GET_LENGTH(unicode);
466 if (length == 0) {
467 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200469 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100470 }
471 return unicode_empty;
472 }
473
474 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200475 void *data = PyUnicode_DATA(unicode);
476 int kind = PyUnicode_KIND(unicode);
477 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100478 if (ch < 256) {
479 PyObject *latin1_char = unicode_latin1[ch];
480 if (latin1_char != NULL) {
481 if (unicode != latin1_char) {
482 Py_INCREF(latin1_char);
483 Py_DECREF(unicode);
484 }
485 return latin1_char;
486 }
487 else {
488 assert(_PyUnicode_CheckConsistency(unicode, 1));
489 Py_INCREF(unicode);
490 unicode_latin1[ch] = unicode;
491 return unicode;
492 }
493 }
494 }
495
496 assert(_PyUnicode_CheckConsistency(unicode, 1));
497 return unicode;
498}
499
500static PyObject*
501unicode_result(PyObject *unicode)
502{
503 assert(_PyUnicode_CHECK(unicode));
504 if (PyUnicode_IS_READY(unicode))
505 return unicode_result_ready(unicode);
506 else
507 return unicode_result_wchar(unicode);
508}
509
Victor Stinnerc4b49542011-12-11 22:44:26 +0100510static PyObject*
511unicode_result_unchanged(PyObject *unicode)
512{
513 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500514 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100515 return NULL;
516 Py_INCREF(unicode);
517 return unicode;
518 }
519 else
520 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100521 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100522}
523
Thomas Wouters477c8d52006-05-27 19:21:47 +0000524/* --- Bloom Filters ----------------------------------------------------- */
525
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low bits
   of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
529
530/* the linebreak mask is set up by Unicode_Init below */
531
Antoine Pitrouf068f942010-01-13 14:19:12 +0000532#if LONG_BIT >= 128
533#define BLOOM_WIDTH 128
534#elif LONG_BIT >= 64
535#define BLOOM_WIDTH 64
536#elif LONG_BIT >= 32
537#define BLOOM_WIDTH 32
538#else
539#error "LONG_BIT is smaller than 32"
540#endif
541
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542#define BLOOM_MASK unsigned long
543
Serhiy Storchaka05997252013-01-26 12:14:02 +0200544static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000545
/* Non-zero when the bit selected by ch is set in mask.  Both arguments are
   parenthesized so that compound expressions can be passed safely. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547
Benjamin Peterson29060642009-01-31 22:14:21 +0000548#define BLOOM_LINEBREAK(ch) \
549 ((ch) < 128U ? ascii_linebreak[(ch)] : \
550 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
Alexander Belopolsky40018472011-02-26 01:02:56 +0000552Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000554{
Victor Stinnera85af502013-04-09 21:53:54 +0200555#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
556 do { \
557 TYPE *data = (TYPE *)PTR; \
558 TYPE *end = data + LEN; \
559 Py_UCS4 ch; \
560 for (; data != end; data++) { \
561 ch = *data; \
562 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
563 } \
564 break; \
565 } while (0)
566
Thomas Wouters477c8d52006-05-27 19:21:47 +0000567 /* calculate simple bloom-style bitmask for a given unicode string */
568
Antoine Pitrouf068f942010-01-13 14:19:12 +0000569 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000570
571 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200572 switch (kind) {
573 case PyUnicode_1BYTE_KIND:
574 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
575 break;
576 case PyUnicode_2BYTE_KIND:
577 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
578 break;
579 case PyUnicode_4BYTE_KIND:
580 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
581 break;
582 default:
583 assert(0);
584 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000585 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200586
587#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000588}
589
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200590/* Compilation of templated routines */
591
592#include "stringlib/asciilib.h"
593#include "stringlib/fastsearch.h"
594#include "stringlib/partition.h"
595#include "stringlib/split.h"
596#include "stringlib/count.h"
597#include "stringlib/find.h"
598#include "stringlib/find_max_char.h"
599#include "stringlib/localeutil.h"
600#include "stringlib/undef.h"
601
602#include "stringlib/ucs1lib.h"
603#include "stringlib/fastsearch.h"
604#include "stringlib/partition.h"
605#include "stringlib/split.h"
606#include "stringlib/count.h"
607#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300608#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200609#include "stringlib/find_max_char.h"
610#include "stringlib/localeutil.h"
611#include "stringlib/undef.h"
612
613#include "stringlib/ucs2lib.h"
614#include "stringlib/fastsearch.h"
615#include "stringlib/partition.h"
616#include "stringlib/split.h"
617#include "stringlib/count.h"
618#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300619#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200620#include "stringlib/find_max_char.h"
621#include "stringlib/localeutil.h"
622#include "stringlib/undef.h"
623
624#include "stringlib/ucs4lib.h"
625#include "stringlib/fastsearch.h"
626#include "stringlib/partition.h"
627#include "stringlib/split.h"
628#include "stringlib/count.h"
629#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300630#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200631#include "stringlib/find_max_char.h"
632#include "stringlib/localeutil.h"
633#include "stringlib/undef.h"
634
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200635#include "stringlib/unicodedefs.h"
636#include "stringlib/fastsearch.h"
637#include "stringlib/count.h"
638#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100639#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200640
Guido van Rossumd57fd912000-03-10 22:53:23 +0000641/* --- Unicode Object ----------------------------------------------------- */
642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200643static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200644fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645
Serhiy Storchakad9d769f2015-03-24 21:55:47 +0200646Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200647 Py_ssize_t size, Py_UCS4 ch,
648 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200649{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200650 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
651
652 switch (kind) {
653 case PyUnicode_1BYTE_KIND:
654 {
655 Py_UCS1 ch1 = (Py_UCS1) ch;
656 if (ch1 == ch)
657 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
658 else
659 return -1;
660 }
661 case PyUnicode_2BYTE_KIND:
662 {
663 Py_UCS2 ch2 = (Py_UCS2) ch;
664 if (ch2 == ch)
665 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
666 else
667 return -1;
668 }
669 case PyUnicode_4BYTE_KIND:
670 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
671 default:
672 assert(0);
673 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200675}
676
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters in the range
   [old_length, current length) of `unicode` with 0xff bytes, so that code
   forgetting to initialize them is detected earlier.  Nothing is written when
   the current length is not greater than `old_length`.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings.  U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* The kind value is used directly as the per-character byte width. */
    int width = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);
    Py_ssize_t added;

    if (new_length <= old_length)
        return;

    added = new_length - old_length;
    memset(bytes + old_length * width, 0xff, added * width);
}
#endif
695
Victor Stinnerfe226c02011-10-03 03:52:20 +0200696static PyObject*
697resize_compact(PyObject *unicode, Py_ssize_t length)
698{
699 Py_ssize_t char_size;
700 Py_ssize_t struct_size;
701 Py_ssize_t new_size;
702 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100703 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200704#ifdef Py_DEBUG
705 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
706#endif
707
Victor Stinner79891572012-05-03 13:43:07 +0200708 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100710 assert(PyUnicode_IS_COMPACT(unicode));
711
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200712 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100713 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200714 struct_size = sizeof(PyASCIIObject);
715 else
716 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200717 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718
Victor Stinnerfe226c02011-10-03 03:52:20 +0200719 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
720 PyErr_NoMemory();
721 return NULL;
722 }
723 new_size = (struct_size + (length + 1) * char_size);
724
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200725 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
726 PyObject_DEL(_PyUnicode_UTF8(unicode));
727 _PyUnicode_UTF8(unicode) = NULL;
728 _PyUnicode_UTF8_LENGTH(unicode) = 0;
729 }
Victor Stinner84def372011-12-11 20:04:56 +0100730 _Py_DEC_REFTOTAL;
731 _Py_ForgetReference(unicode);
732
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300733 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100734 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100735 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 PyErr_NoMemory();
737 return NULL;
738 }
Victor Stinner84def372011-12-11 20:04:56 +0100739 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200740 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100741
Victor Stinnerfe226c02011-10-03 03:52:20 +0200742 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200743 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200744 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100745 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200746 _PyUnicode_WSTR_LENGTH(unicode) = length;
747 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100748 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
749 PyObject_DEL(_PyUnicode_WSTR(unicode));
750 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100751 if (!PyUnicode_IS_ASCII(unicode))
752 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100753 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200754#ifdef Py_DEBUG
755 unicode_fill_invalid(unicode, old_length);
756#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200757 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
758 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200759 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760 return unicode;
761}
762
Alexander Belopolsky40018472011-02-26 01:02:56 +0000763static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200764resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000765{
Victor Stinner95663112011-10-04 01:03:50 +0200766 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100767 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200768 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000770
Victor Stinnerfe226c02011-10-03 03:52:20 +0200771 if (PyUnicode_IS_READY(unicode)) {
772 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200773 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200774 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200775#ifdef Py_DEBUG
776 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
777#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200778
779 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200780 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200781 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
782 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200783
784 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
785 PyErr_NoMemory();
786 return -1;
787 }
788 new_size = (length + 1) * char_size;
789
Victor Stinner7a9105a2011-12-12 00:13:42 +0100790 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
791 {
792 PyObject_DEL(_PyUnicode_UTF8(unicode));
793 _PyUnicode_UTF8(unicode) = NULL;
794 _PyUnicode_UTF8_LENGTH(unicode) = 0;
795 }
796
Victor Stinnerfe226c02011-10-03 03:52:20 +0200797 data = (PyObject *)PyObject_REALLOC(data, new_size);
798 if (data == NULL) {
799 PyErr_NoMemory();
800 return -1;
801 }
802 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200803 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200804 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200805 _PyUnicode_WSTR_LENGTH(unicode) = length;
806 }
807 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200808 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200809 _PyUnicode_UTF8_LENGTH(unicode) = length;
810 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200811 _PyUnicode_LENGTH(unicode) = length;
812 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200813#ifdef Py_DEBUG
814 unicode_fill_invalid(unicode, old_length);
815#endif
Victor Stinner95663112011-10-04 01:03:50 +0200816 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200817 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200818 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200819 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200820 }
Victor Stinner95663112011-10-04 01:03:50 +0200821 assert(_PyUnicode_WSTR(unicode) != NULL);
822
823 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700824 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +0200825 PyErr_NoMemory();
826 return -1;
827 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100828 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200829 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100830 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200831 if (!wstr) {
832 PyErr_NoMemory();
833 return -1;
834 }
835 _PyUnicode_WSTR(unicode) = wstr;
836 _PyUnicode_WSTR(unicode)[length] = 0;
837 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200838 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000839 return 0;
840}
841
Victor Stinnerfe226c02011-10-03 03:52:20 +0200842static PyObject*
843resize_copy(PyObject *unicode, Py_ssize_t length)
844{
845 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100846 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200847 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100848
Benjamin Petersonbac79492012-01-14 13:34:47 -0500849 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100850 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200851
852 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
853 if (copy == NULL)
854 return NULL;
855
856 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200857 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200858 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200859 }
860 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200861 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100862
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200863 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200864 if (w == NULL)
865 return NULL;
866 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
867 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200868 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
869 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200870 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200871 }
872}
873
Guido van Rossumd57fd912000-03-10 22:53:23 +0000874/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000875 Ux0000 terminated; some code (e.g. new_identifier)
876 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000877
878 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000879 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880
881*/
882
Alexander Belopolsky40018472011-02-26 01:02:56 +0000883static PyUnicodeObject *
884_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200886 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200887 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000888
Thomas Wouters477c8d52006-05-27 19:21:47 +0000889 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890 if (length == 0 && unicode_empty != NULL) {
891 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200892 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893 }
894
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000895 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700896 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000897 return (PyUnicodeObject *)PyErr_NoMemory();
898 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200899 if (length < 0) {
900 PyErr_SetString(PyExc_SystemError,
901 "Negative size passed to _PyUnicode_New");
902 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000903 }
904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200905 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
906 if (unicode == NULL)
907 return NULL;
908 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100909
910 _PyUnicode_WSTR_LENGTH(unicode) = length;
911 _PyUnicode_HASH(unicode) = -1;
912 _PyUnicode_STATE(unicode).interned = 0;
913 _PyUnicode_STATE(unicode).kind = 0;
914 _PyUnicode_STATE(unicode).compact = 0;
915 _PyUnicode_STATE(unicode).ready = 0;
916 _PyUnicode_STATE(unicode).ascii = 0;
917 _PyUnicode_DATA_ANY(unicode) = NULL;
918 _PyUnicode_LENGTH(unicode) = 0;
919 _PyUnicode_UTF8(unicode) = NULL;
920 _PyUnicode_UTF8_LENGTH(unicode) = 0;
921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200922 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
923 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100924 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000925 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100926 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000927 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200928
Jeremy Hyltond8082792003-09-16 19:41:39 +0000929 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000930 * the caller fails before initializing str -- unicode_resize()
931 * reads str[0], and the Keep-Alive optimization can keep memory
932 * allocated for str alive across a call to unicode_dealloc(unicode).
933 * We don't want unicode_resize to read uninitialized memory in
934 * that case.
935 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200936 _PyUnicode_WSTR(unicode)[0] = 0;
937 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100938
Victor Stinner7931d9a2011-11-04 00:22:48 +0100939 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000940 return unicode;
941}
942
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943static const char*
944unicode_kind_name(PyObject *unicode)
945{
Victor Stinner42dfd712011-10-03 14:41:45 +0200946 /* don't check consistency: unicode_kind_name() is called from
947 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200948 if (!PyUnicode_IS_COMPACT(unicode))
949 {
950 if (!PyUnicode_IS_READY(unicode))
951 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600952 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200953 {
954 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200955 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200956 return "legacy ascii";
957 else
958 return "legacy latin1";
959 case PyUnicode_2BYTE_KIND:
960 return "legacy UCS2";
961 case PyUnicode_4BYTE_KIND:
962 return "legacy UCS4";
963 default:
964 return "<legacy invalid kind>";
965 }
966 }
967 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600968 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200970 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200971 return "ascii";
972 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200973 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200974 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200975 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200976 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200977 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200978 default:
979 return "<invalid compact kind>";
980 }
981}
982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}
988
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}
992void *_PyUnicode_data(void *unicode){
993 printf("obj %p\n", unicode);
994 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
995 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
996 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
997 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
998 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
999 return PyUnicode_DATA(unicode);
1000}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001001
1002void
1003_PyUnicode_Dump(PyObject *op)
1004{
1005 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001006 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1007 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1008 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001009
Victor Stinnera849a4b2011-10-03 12:12:11 +02001010 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001011 {
1012 if (ascii->state.ascii)
1013 data = (ascii + 1);
1014 else
1015 data = (compact + 1);
1016 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001017 else
1018 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001019 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1020 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001021
Victor Stinnera849a4b2011-10-03 12:12:11 +02001022 if (ascii->wstr == data)
1023 printf("shared ");
1024 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001025
Victor Stinnera3b334d2011-10-03 13:53:37 +02001026 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001027 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001028 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1029 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001030 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1031 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001032 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001033 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001034}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001035#endif
1036
1037PyObject *
1038PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1039{
1040 PyObject *obj;
1041 PyCompactUnicodeObject *unicode;
1042 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001043 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001044 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 Py_ssize_t char_size;
1046 Py_ssize_t struct_size;
1047
1048 /* Optimization for empty strings */
1049 if (size == 0 && unicode_empty != NULL) {
1050 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001051 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 }
1053
Victor Stinner9e9d6892011-10-04 01:02:02 +02001054 is_ascii = 0;
1055 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 struct_size = sizeof(PyCompactUnicodeObject);
1057 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001058 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001059 char_size = 1;
1060 is_ascii = 1;
1061 struct_size = sizeof(PyASCIIObject);
1062 }
1063 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001064 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 char_size = 1;
1066 }
1067 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001068 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 char_size = 2;
1070 if (sizeof(wchar_t) == 2)
1071 is_sharing = 1;
1072 }
1073 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001074 if (maxchar > MAX_UNICODE) {
1075 PyErr_SetString(PyExc_SystemError,
1076 "invalid maximum character passed to PyUnicode_New");
1077 return NULL;
1078 }
Victor Stinner8f825062012-04-27 13:55:39 +02001079 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001080 char_size = 4;
1081 if (sizeof(wchar_t) == 4)
1082 is_sharing = 1;
1083 }
1084
1085 /* Ensure we won't overflow the size. */
1086 if (size < 0) {
1087 PyErr_SetString(PyExc_SystemError,
1088 "Negative size passed to PyUnicode_New");
1089 return NULL;
1090 }
1091 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1092 return PyErr_NoMemory();
1093
1094 /* Duplicated allocation code from _PyObject_New() instead of a call to
1095 * PyObject_New() so we are able to allocate space for the object and
1096 * it's data buffer.
1097 */
1098 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1099 if (obj == NULL)
1100 return PyErr_NoMemory();
1101 obj = PyObject_INIT(obj, &PyUnicode_Type);
1102 if (obj == NULL)
1103 return NULL;
1104
1105 unicode = (PyCompactUnicodeObject *)obj;
1106 if (is_ascii)
1107 data = ((PyASCIIObject*)obj) + 1;
1108 else
1109 data = unicode + 1;
1110 _PyUnicode_LENGTH(unicode) = size;
1111 _PyUnicode_HASH(unicode) = -1;
1112 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001113 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001114 _PyUnicode_STATE(unicode).compact = 1;
1115 _PyUnicode_STATE(unicode).ready = 1;
1116 _PyUnicode_STATE(unicode).ascii = is_ascii;
1117 if (is_ascii) {
1118 ((char*)data)[size] = 0;
1119 _PyUnicode_WSTR(unicode) = NULL;
1120 }
Victor Stinner8f825062012-04-27 13:55:39 +02001121 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001122 ((char*)data)[size] = 0;
1123 _PyUnicode_WSTR(unicode) = NULL;
1124 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001126 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001127 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128 else {
1129 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001130 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001131 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001132 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001133 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 ((Py_UCS4*)data)[size] = 0;
1135 if (is_sharing) {
1136 _PyUnicode_WSTR_LENGTH(unicode) = size;
1137 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1138 }
1139 else {
1140 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1141 _PyUnicode_WSTR(unicode) = NULL;
1142 }
1143 }
Victor Stinner8f825062012-04-27 13:55:39 +02001144#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001145 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001146#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001147 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148 return obj;
1149}
1150
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.  A
   high surrogate immediately followed by a low surrogate is joined into one
   code point; every other unit, including an unpaired surrogate, is copied
   unchanged.  The other conversions are implemented as macros for efficiency.

   `unicode` must be a 4-byte-kind string (asserted), and the number of code
   points produced must equal its length (asserted).  This function assumes
   that unicode can hold one more code point than wstr characters for a
   terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    src = begin;
    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        /* Look at src[1] only after checking that it is inside the input. */
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src;
            src++;
        }
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1190
Victor Stinnercd9950f2011-10-02 00:34:53 +02001191static int
Victor Stinner488fa492011-12-12 00:01:39 +01001192unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001193{
Victor Stinner488fa492011-12-12 00:01:39 +01001194 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001195 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001196 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001197 return -1;
1198 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001199 return 0;
1200}
1201
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001202static int
1203_copy_characters(PyObject *to, Py_ssize_t to_start,
1204 PyObject *from, Py_ssize_t from_start,
1205 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001206{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001207 unsigned int from_kind, to_kind;
1208 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209
Victor Stinneree4544c2012-05-09 22:24:08 +02001210 assert(0 <= how_many);
1211 assert(0 <= from_start);
1212 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001213 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001214 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001215 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216
Victor Stinnerd3f08822012-05-29 12:57:52 +02001217 assert(PyUnicode_Check(to));
1218 assert(PyUnicode_IS_READY(to));
1219 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1220
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001221 if (how_many == 0)
1222 return 0;
1223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001225 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001226 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001227 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228
Victor Stinnerf1852262012-06-16 16:38:26 +02001229#ifdef Py_DEBUG
1230 if (!check_maxchar
1231 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1232 {
1233 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1234 Py_UCS4 ch;
1235 Py_ssize_t i;
1236 for (i=0; i < how_many; i++) {
1237 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1238 assert(ch <= to_maxchar);
1239 }
1240 }
1241#endif
1242
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001243 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001244 if (check_maxchar
1245 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1246 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001247 /* Writing Latin-1 characters into an ASCII string requires to
1248 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001249 Py_UCS4 max_char;
1250 max_char = ucs1lib_find_max_char(from_data,
1251 (Py_UCS1*)from_data + how_many);
1252 if (max_char >= 128)
1253 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001254 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001255 Py_MEMCPY((char*)to_data + to_kind * to_start,
1256 (char*)from_data + from_kind * from_start,
1257 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001259 else if (from_kind == PyUnicode_1BYTE_KIND
1260 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001261 {
1262 _PyUnicode_CONVERT_BYTES(
1263 Py_UCS1, Py_UCS2,
1264 PyUnicode_1BYTE_DATA(from) + from_start,
1265 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1266 PyUnicode_2BYTE_DATA(to) + to_start
1267 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001268 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001269 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001270 && to_kind == PyUnicode_4BYTE_KIND)
1271 {
1272 _PyUnicode_CONVERT_BYTES(
1273 Py_UCS1, Py_UCS4,
1274 PyUnicode_1BYTE_DATA(from) + from_start,
1275 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1276 PyUnicode_4BYTE_DATA(to) + to_start
1277 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001278 }
1279 else if (from_kind == PyUnicode_2BYTE_KIND
1280 && to_kind == PyUnicode_4BYTE_KIND)
1281 {
1282 _PyUnicode_CONVERT_BYTES(
1283 Py_UCS2, Py_UCS4,
1284 PyUnicode_2BYTE_DATA(from) + from_start,
1285 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1286 PyUnicode_4BYTE_DATA(to) + to_start
1287 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001288 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001289 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001290 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1291
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001292 if (!check_maxchar) {
1293 if (from_kind == PyUnicode_2BYTE_KIND
1294 && to_kind == PyUnicode_1BYTE_KIND)
1295 {
1296 _PyUnicode_CONVERT_BYTES(
1297 Py_UCS2, Py_UCS1,
1298 PyUnicode_2BYTE_DATA(from) + from_start,
1299 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1300 PyUnicode_1BYTE_DATA(to) + to_start
1301 );
1302 }
1303 else if (from_kind == PyUnicode_4BYTE_KIND
1304 && to_kind == PyUnicode_1BYTE_KIND)
1305 {
1306 _PyUnicode_CONVERT_BYTES(
1307 Py_UCS4, Py_UCS1,
1308 PyUnicode_4BYTE_DATA(from) + from_start,
1309 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1310 PyUnicode_1BYTE_DATA(to) + to_start
1311 );
1312 }
1313 else if (from_kind == PyUnicode_4BYTE_KIND
1314 && to_kind == PyUnicode_2BYTE_KIND)
1315 {
1316 _PyUnicode_CONVERT_BYTES(
1317 Py_UCS4, Py_UCS2,
1318 PyUnicode_4BYTE_DATA(from) + from_start,
1319 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1320 PyUnicode_2BYTE_DATA(to) + to_start
1321 );
1322 }
1323 else {
1324 assert(0);
1325 return -1;
1326 }
1327 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001328 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001329 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001330 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001331 Py_ssize_t i;
1332
Victor Stinnera0702ab2011-09-29 14:14:38 +02001333 for (i=0; i < how_many; i++) {
1334 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001335 if (ch > to_maxchar)
1336 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001337 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1338 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001339 }
1340 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001341 return 0;
1342}
1343
Victor Stinnerd3f08822012-05-29 12:57:52 +02001344void
1345_PyUnicode_FastCopyCharacters(
1346 PyObject *to, Py_ssize_t to_start,
1347 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001348{
1349 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1350}
1351
1352Py_ssize_t
1353PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1354 PyObject *from, Py_ssize_t from_start,
1355 Py_ssize_t how_many)
1356{
1357 int err;
1358
1359 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1360 PyErr_BadInternalCall();
1361 return -1;
1362 }
1363
Benjamin Petersonbac79492012-01-14 13:34:47 -05001364 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001365 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001366 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001367 return -1;
1368
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001369 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001370 PyErr_SetString(PyExc_IndexError, "string index out of range");
1371 return -1;
1372 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001373 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001374 PyErr_SetString(PyExc_IndexError, "string index out of range");
1375 return -1;
1376 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001377 if (how_many < 0) {
1378 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1379 return -1;
1380 }
1381 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001382 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1383 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001384 "Cannot write %zi characters at %zi "
1385 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001386 how_many, to_start, PyUnicode_GET_LENGTH(to));
1387 return -1;
1388 }
1389
1390 if (how_many == 0)
1391 return 0;
1392
Victor Stinner488fa492011-12-12 00:01:39 +01001393 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001394 return -1;
1395
1396 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1397 if (err) {
1398 PyErr_Format(PyExc_SystemError,
1399 "Cannot copy %s characters "
1400 "into a string of %s characters",
1401 unicode_kind_name(from),
1402 unicode_kind_name(to));
1403 return -1;
1404 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001405 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406}
1407
Victor Stinner17222162011-09-28 22:15:37 +02001408/* Find the maximum code point and count the number of surrogate pairs so a
1409 correct string length can be computed before converting a string to UCS4.
1410 This function counts single surrogates as a character and not as a pair.
1411
1412 Return 0 on success, or -1 on error. */
1413static int
1414find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1415 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416{
1417 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001418 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419
Victor Stinnerc53be962011-10-02 21:33:54 +02001420 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 *num_surrogates = 0;
1422 *maxchar = 0;
1423
1424 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001426 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1427 && (iter+1) < end
1428 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1429 {
1430 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1431 ++(*num_surrogates);
1432 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 }
1434 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001435#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001436 {
1437 ch = *iter;
1438 iter++;
1439 }
1440 if (ch > *maxchar) {
1441 *maxchar = ch;
1442 if (*maxchar > MAX_UNICODE) {
1443 PyErr_Format(PyExc_ValueError,
1444 "character U+%x is not in range [U+0000; U+10ffff]",
1445 ch);
1446 return -1;
1447 }
1448 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001449 }
1450 return 0;
1451}
1452
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001453int
1454_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001455{
1456 wchar_t *end;
1457 Py_UCS4 maxchar = 0;
1458 Py_ssize_t num_surrogates;
1459#if SIZEOF_WCHAR_T == 2
1460 Py_ssize_t length_wo_surrogates;
1461#endif
1462
Georg Brandl7597add2011-10-05 16:36:47 +02001463 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001464 strings were created using _PyObject_New() and where no canonical
1465 representation (the str field) has been set yet aka strings
1466 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001467 assert(_PyUnicode_CHECK(unicode));
1468 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001469 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001470 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001471 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001472 /* Actually, it should neither be interned nor be anything else: */
1473 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001475 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001476 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001477 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001479
1480 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001481 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1482 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001483 PyErr_NoMemory();
1484 return -1;
1485 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001486 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 _PyUnicode_WSTR(unicode), end,
1488 PyUnicode_1BYTE_DATA(unicode));
1489 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1490 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1491 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1492 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001493 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001494 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001495 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001496 }
1497 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001498 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001499 _PyUnicode_UTF8(unicode) = NULL;
1500 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001501 }
1502 PyObject_FREE(_PyUnicode_WSTR(unicode));
1503 _PyUnicode_WSTR(unicode) = NULL;
1504 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1505 }
1506 /* In this case we might have to convert down from 4-byte native
1507 wchar_t to 2-byte unicode. */
1508 else if (maxchar < 65536) {
1509 assert(num_surrogates == 0 &&
1510 "FindMaxCharAndNumSurrogatePairs() messed up");
1511
Victor Stinner506f5922011-09-28 22:34:18 +02001512#if SIZEOF_WCHAR_T == 2
1513 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001514 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001515 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1516 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1517 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001518 _PyUnicode_UTF8(unicode) = NULL;
1519 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001520#else
1521 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001522 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001523 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001524 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001525 PyErr_NoMemory();
1526 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001527 }
Victor Stinner506f5922011-09-28 22:34:18 +02001528 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1529 _PyUnicode_WSTR(unicode), end,
1530 PyUnicode_2BYTE_DATA(unicode));
1531 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1532 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1533 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001534 _PyUnicode_UTF8(unicode) = NULL;
1535 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001536 PyObject_FREE(_PyUnicode_WSTR(unicode));
1537 _PyUnicode_WSTR(unicode) = NULL;
1538 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1539#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001540 }
1541 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1542 else {
1543#if SIZEOF_WCHAR_T == 2
1544 /* in case the native representation is 2-bytes, we need to allocate a
1545 new normalized 4-byte version. */
1546 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001547 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1548 PyErr_NoMemory();
1549 return -1;
1550 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001551 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1552 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001553 PyErr_NoMemory();
1554 return -1;
1555 }
1556 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1557 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001558 _PyUnicode_UTF8(unicode) = NULL;
1559 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001560 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1561 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001562 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001563 PyObject_FREE(_PyUnicode_WSTR(unicode));
1564 _PyUnicode_WSTR(unicode) = NULL;
1565 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1566#else
1567 assert(num_surrogates == 0);
1568
Victor Stinnerc3c74152011-10-02 20:39:55 +02001569 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001570 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001571 _PyUnicode_UTF8(unicode) = NULL;
1572 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001573 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1574#endif
1575 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1576 }
1577 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001578 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001579 return 0;
1580}
1581
Alexander Belopolsky40018472011-02-26 01:02:56 +00001582static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001583unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001584{
Walter Dörwald16807132007-05-25 13:52:07 +00001585 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001586 case SSTATE_NOT_INTERNED:
1587 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001588
Benjamin Peterson29060642009-01-31 22:14:21 +00001589 case SSTATE_INTERNED_MORTAL:
1590 /* revive dead object temporarily for DelItem */
1591 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001592 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001593 Py_FatalError(
1594 "deletion of interned string failed");
1595 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001596
Benjamin Peterson29060642009-01-31 22:14:21 +00001597 case SSTATE_INTERNED_IMMORTAL:
1598 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001599
Benjamin Peterson29060642009-01-31 22:14:21 +00001600 default:
1601 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001602 }
1603
Victor Stinner03490912011-10-03 23:45:12 +02001604 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001605 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001606 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001607 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001608 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1609 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001610
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001611 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001612}
1613
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string or one
   of the cached single-character strings of unicode_latin1[], 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a string of exactly one character that is past the wchar_t-only
       stage can be a cached latin1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1630
Alexander Belopolsky40018472011-02-26 01:02:56 +00001631static int
Victor Stinner488fa492011-12-12 00:01:39 +01001632unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001633{
Victor Stinner488fa492011-12-12 00:01:39 +01001634 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001635 if (Py_REFCNT(unicode) != 1)
1636 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001637 if (_PyUnicode_HASH(unicode) != -1)
1638 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001639 if (PyUnicode_CHECK_INTERNED(unicode))
1640 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001641 if (!PyUnicode_CheckExact(unicode))
1642 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001643#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001644 /* singleton refcount is greater than 1 */
1645 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001646#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001647 return 1;
1648}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001649
Victor Stinnerfe226c02011-10-03 03:52:20 +02001650static int
1651unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1652{
1653 PyObject *unicode;
1654 Py_ssize_t old_length;
1655
1656 assert(p_unicode != NULL);
1657 unicode = *p_unicode;
1658
1659 assert(unicode != NULL);
1660 assert(PyUnicode_Check(unicode));
1661 assert(0 <= length);
1662
Victor Stinner910337b2011-10-03 03:20:16 +02001663 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001664 old_length = PyUnicode_WSTR_LENGTH(unicode);
1665 else
1666 old_length = PyUnicode_GET_LENGTH(unicode);
1667 if (old_length == length)
1668 return 0;
1669
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001670 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001671 _Py_INCREF_UNICODE_EMPTY();
1672 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001673 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001674 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001675 return 0;
1676 }
1677
Victor Stinner488fa492011-12-12 00:01:39 +01001678 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001679 PyObject *copy = resize_copy(unicode, length);
1680 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001681 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001682 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001683 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001684 }
1685
Victor Stinnerfe226c02011-10-03 03:52:20 +02001686 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001687 PyObject *new_unicode = resize_compact(unicode, length);
1688 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001689 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001690 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001691 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001692 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001693 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001694}
1695
Alexander Belopolsky40018472011-02-26 01:02:56 +00001696int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001697PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001698{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001699 PyObject *unicode;
1700 if (p_unicode == NULL) {
1701 PyErr_BadInternalCall();
1702 return -1;
1703 }
1704 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001705 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001706 {
1707 PyErr_BadInternalCall();
1708 return -1;
1709 }
1710 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001711}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001712
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001713/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001714
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001715 WARNING: The function doesn't copy the terminating null character and
1716 doesn't check the maximum character (may write a latin1 character in an
1717 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001718static void
1719unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1720 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001721{
1722 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1723 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001724 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001725
1726 switch (kind) {
1727 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001728 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001729#ifdef Py_DEBUG
1730 if (PyUnicode_IS_ASCII(unicode)) {
1731 Py_UCS4 maxchar = ucs1lib_find_max_char(
1732 (const Py_UCS1*)str,
1733 (const Py_UCS1*)str + len);
1734 assert(maxchar < 128);
1735 }
1736#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001737 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001738 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001739 }
1740 case PyUnicode_2BYTE_KIND: {
1741 Py_UCS2 *start = (Py_UCS2 *)data + index;
1742 Py_UCS2 *ucs2 = start;
1743 assert(index <= PyUnicode_GET_LENGTH(unicode));
1744
Victor Stinner184252a2012-06-16 02:57:41 +02001745 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001746 *ucs2 = (Py_UCS2)*str;
1747
1748 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001749 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001750 }
1751 default: {
1752 Py_UCS4 *start = (Py_UCS4 *)data + index;
1753 Py_UCS4 *ucs4 = start;
1754 assert(kind == PyUnicode_4BYTE_KIND);
1755 assert(index <= PyUnicode_GET_LENGTH(unicode));
1756
Victor Stinner184252a2012-06-16 02:57:41 +02001757 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001758 *ucs4 = (Py_UCS4)*str;
1759
1760 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001761 }
1762 }
1763}
1764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765static PyObject*
1766get_latin1_char(unsigned char ch)
1767{
Victor Stinnera464fc12011-10-02 20:39:30 +02001768 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001770 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 if (!unicode)
1772 return NULL;
1773 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001774 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775 unicode_latin1[ch] = unicode;
1776 }
1777 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001778 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001779}
1780
Victor Stinner985a82a2014-01-03 12:53:47 +01001781static PyObject*
1782unicode_char(Py_UCS4 ch)
1783{
1784 PyObject *unicode;
1785
1786 assert(ch <= MAX_UNICODE);
1787
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001788 if (ch < 256)
1789 return get_latin1_char(ch);
1790
Victor Stinner985a82a2014-01-03 12:53:47 +01001791 unicode = PyUnicode_New(1, ch);
1792 if (unicode == NULL)
1793 return NULL;
1794 switch (PyUnicode_KIND(unicode)) {
1795 case PyUnicode_1BYTE_KIND:
1796 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1797 break;
1798 case PyUnicode_2BYTE_KIND:
1799 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1800 break;
1801 default:
1802 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1803 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1804 }
1805 assert(_PyUnicode_CheckConsistency(unicode, 1));
1806 return unicode;
1807}
1808
Alexander Belopolsky40018472011-02-26 01:02:56 +00001809PyObject *
1810PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001812 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 Py_UCS4 maxchar = 0;
1814 Py_ssize_t num_surrogates;
1815
1816 if (u == NULL)
1817 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001818
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001819 /* If the Unicode data is known at construction time, we can apply
1820 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001823 if (size == 0)
1824 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 /* Single character Unicode objects in the Latin-1 range are
1827 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001828 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001829 return get_latin1_char((unsigned char)*u);
1830
1831 /* If not empty and not single character, copy the Unicode data
1832 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001833 if (find_maxchar_surrogates(u, u + size,
1834 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 return NULL;
1836
Victor Stinner8faf8212011-12-08 22:14:11 +01001837 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001838 if (!unicode)
1839 return NULL;
1840
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841 switch (PyUnicode_KIND(unicode)) {
1842 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001843 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001844 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1845 break;
1846 case PyUnicode_2BYTE_KIND:
1847#if Py_UNICODE_SIZE == 2
1848 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1849#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001850 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001851 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1852#endif
1853 break;
1854 case PyUnicode_4BYTE_KIND:
1855#if SIZEOF_WCHAR_T == 2
1856 /* This is the only case which has to process surrogates, thus
1857 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001858 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001859#else
1860 assert(num_surrogates == 0);
1861 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1862#endif
1863 break;
1864 default:
1865 assert(0 && "Impossible state");
1866 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001867
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001868 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001869}
1870
Alexander Belopolsky40018472011-02-26 01:02:56 +00001871PyObject *
1872PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001873{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001874 if (size < 0) {
1875 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001876 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001877 return NULL;
1878 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001879 if (u != NULL)
1880 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1881 else
1882 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001883}
1884
Alexander Belopolsky40018472011-02-26 01:02:56 +00001885PyObject *
1886PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001887{
1888 size_t size = strlen(u);
1889 if (size > PY_SSIZE_T_MAX) {
1890 PyErr_SetString(PyExc_OverflowError, "input too long");
1891 return NULL;
1892 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001893 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001894}
1895
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001896PyObject *
1897_PyUnicode_FromId(_Py_Identifier *id)
1898{
1899 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001900 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1901 strlen(id->string),
1902 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001903 if (!id->object)
1904 return NULL;
1905 PyUnicode_InternInPlace(&id->object);
1906 assert(!id->next);
1907 id->next = static_strings;
1908 static_strings = id;
1909 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001910 return id->object;
1911}
1912
1913void
1914_PyUnicode_ClearStaticStrings()
1915{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001916 _Py_Identifier *tmp, *s = static_strings;
1917 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001918 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001919 tmp = s->next;
1920 s->next = NULL;
1921 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001922 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001923 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001924}
1925
Benjamin Peterson0df54292012-03-26 14:50:32 -04001926/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001927
Victor Stinnerd3f08822012-05-29 12:57:52 +02001928PyObject*
1929_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001930{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001931 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001932 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001933 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001934#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001935 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001936#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001937 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001938 }
Victor Stinner785938e2011-12-11 20:09:03 +01001939 unicode = PyUnicode_New(size, 127);
1940 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001941 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001942 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1943 assert(_PyUnicode_CheckConsistency(unicode, 1));
1944 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001945}
1946
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001947static Py_UCS4
1948kind_maxchar_limit(unsigned int kind)
1949{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001950 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001951 case PyUnicode_1BYTE_KIND:
1952 return 0x80;
1953 case PyUnicode_2BYTE_KIND:
1954 return 0x100;
1955 case PyUnicode_4BYTE_KIND:
1956 return 0x10000;
1957 default:
1958 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001959 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001960 }
1961}
1962
Victor Stinnere6abb482012-05-02 01:15:40 +02001963Py_LOCAL_INLINE(Py_UCS4)
1964align_maxchar(Py_UCS4 maxchar)
1965{
1966 if (maxchar <= 127)
1967 return 127;
1968 else if (maxchar <= 255)
1969 return 255;
1970 else if (maxchar <= 65535)
1971 return 65535;
1972 else
1973 return MAX_UNICODE;
1974}
1975
Victor Stinner702c7342011-10-05 13:50:52 +02001976static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001977_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001978{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001979 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001980 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001981
Serhiy Storchaka678db842013-01-26 12:16:36 +02001982 if (size == 0)
1983 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001984 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001985 if (size == 1)
1986 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001987
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001988 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001989 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990 if (!res)
1991 return NULL;
1992 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001993 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001995}
1996
Victor Stinnere57b1c02011-09-28 22:20:48 +02001997static PyObject*
1998_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999{
2000 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002001 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002002
Serhiy Storchaka678db842013-01-26 12:16:36 +02002003 if (size == 0)
2004 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002005 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002006 if (size == 1)
2007 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002008
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002009 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002010 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002011 if (!res)
2012 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002013 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002015 else {
2016 _PyUnicode_CONVERT_BYTES(
2017 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2018 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002019 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002020 return res;
2021}
2022
Victor Stinnere57b1c02011-09-28 22:20:48 +02002023static PyObject*
2024_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002025{
2026 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002027 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002028
Serhiy Storchaka678db842013-01-26 12:16:36 +02002029 if (size == 0)
2030 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002031 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002032 if (size == 1)
2033 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002034
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002035 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002036 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 if (!res)
2038 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002039 if (max_char < 256)
2040 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2041 PyUnicode_1BYTE_DATA(res));
2042 else if (max_char < 0x10000)
2043 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2044 PyUnicode_2BYTE_DATA(res));
2045 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002047 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 return res;
2049}
2050
2051PyObject*
2052PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2053{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002054 if (size < 0) {
2055 PyErr_SetString(PyExc_ValueError, "size must be positive");
2056 return NULL;
2057 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002058 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002060 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002061 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002062 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002063 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002064 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002065 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002066 PyErr_SetString(PyExc_SystemError, "invalid kind");
2067 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002068 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002069}
2070
Victor Stinnerece58de2012-04-23 23:36:38 +02002071Py_UCS4
2072_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2073{
2074 enum PyUnicode_Kind kind;
2075 void *startptr, *endptr;
2076
2077 assert(PyUnicode_IS_READY(unicode));
2078 assert(0 <= start);
2079 assert(end <= PyUnicode_GET_LENGTH(unicode));
2080 assert(start <= end);
2081
2082 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2083 return PyUnicode_MAX_CHAR_VALUE(unicode);
2084
2085 if (start == end)
2086 return 127;
2087
Victor Stinner94d558b2012-04-27 22:26:58 +02002088 if (PyUnicode_IS_ASCII(unicode))
2089 return 127;
2090
Victor Stinnerece58de2012-04-23 23:36:38 +02002091 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002092 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002093 endptr = (char *)startptr + end * kind;
2094 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002095 switch(kind) {
2096 case PyUnicode_1BYTE_KIND:
2097 return ucs1lib_find_max_char(startptr, endptr);
2098 case PyUnicode_2BYTE_KIND:
2099 return ucs2lib_find_max_char(startptr, endptr);
2100 case PyUnicode_4BYTE_KIND:
2101 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002102 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002103 assert(0);
2104 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002105 }
2106}
2107
Victor Stinner25a4b292011-10-06 12:31:55 +02002108/* Ensure that a string uses the most efficient storage, if it is not the
2109 case: create a new string with of the right kind. Write NULL into *p_unicode
2110 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002111static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002112unicode_adjust_maxchar(PyObject **p_unicode)
2113{
2114 PyObject *unicode, *copy;
2115 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002116 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002117 unsigned int kind;
2118
2119 assert(p_unicode != NULL);
2120 unicode = *p_unicode;
2121 assert(PyUnicode_IS_READY(unicode));
2122 if (PyUnicode_IS_ASCII(unicode))
2123 return;
2124
2125 len = PyUnicode_GET_LENGTH(unicode);
2126 kind = PyUnicode_KIND(unicode);
2127 if (kind == PyUnicode_1BYTE_KIND) {
2128 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002129 max_char = ucs1lib_find_max_char(u, u + len);
2130 if (max_char >= 128)
2131 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002132 }
2133 else if (kind == PyUnicode_2BYTE_KIND) {
2134 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002135 max_char = ucs2lib_find_max_char(u, u + len);
2136 if (max_char >= 256)
2137 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002138 }
2139 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002140 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002141 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002142 max_char = ucs4lib_find_max_char(u, u + len);
2143 if (max_char >= 0x10000)
2144 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002145 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002146 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002147 if (copy != NULL)
2148 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002149 Py_DECREF(unicode);
2150 *p_unicode = copy;
2151}
2152
Victor Stinner034f6cf2011-09-30 02:26:44 +02002153PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002154_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002155{
Victor Stinner87af4f22011-11-21 23:03:47 +01002156 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002157 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002158
Victor Stinner034f6cf2011-09-30 02:26:44 +02002159 if (!PyUnicode_Check(unicode)) {
2160 PyErr_BadInternalCall();
2161 return NULL;
2162 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002163 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002164 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002165
Victor Stinner87af4f22011-11-21 23:03:47 +01002166 length = PyUnicode_GET_LENGTH(unicode);
2167 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002168 if (!copy)
2169 return NULL;
2170 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2171
Victor Stinner87af4f22011-11-21 23:03:47 +01002172 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2173 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002174 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002175 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002176}
2177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002178
Victor Stinnerbc603d12011-10-02 01:00:40 +02002179/* Widen Unicode objects to larger buffers. Don't write terminating null
2180 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002181
2182void*
2183_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2184{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002185 Py_ssize_t len;
2186 void *result;
2187 unsigned int skind;
2188
Benjamin Petersonbac79492012-01-14 13:34:47 -05002189 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002190 return NULL;
2191
2192 len = PyUnicode_GET_LENGTH(s);
2193 skind = PyUnicode_KIND(s);
2194 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002195 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196 return NULL;
2197 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002198 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002199 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002200 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002201 if (!result)
2202 return PyErr_NoMemory();
2203 assert(skind == PyUnicode_1BYTE_KIND);
2204 _PyUnicode_CONVERT_BYTES(
2205 Py_UCS1, Py_UCS2,
2206 PyUnicode_1BYTE_DATA(s),
2207 PyUnicode_1BYTE_DATA(s) + len,
2208 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002210 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002211 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002212 if (!result)
2213 return PyErr_NoMemory();
2214 if (skind == PyUnicode_2BYTE_KIND) {
2215 _PyUnicode_CONVERT_BYTES(
2216 Py_UCS2, Py_UCS4,
2217 PyUnicode_2BYTE_DATA(s),
2218 PyUnicode_2BYTE_DATA(s) + len,
2219 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002221 else {
2222 assert(skind == PyUnicode_1BYTE_KIND);
2223 _PyUnicode_CONVERT_BYTES(
2224 Py_UCS1, Py_UCS4,
2225 PyUnicode_1BYTE_DATA(s),
2226 PyUnicode_1BYTE_DATA(s) + len,
2227 result);
2228 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002230 default:
2231 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002232 }
Victor Stinner01698042011-10-04 00:04:26 +02002233 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 return NULL;
2235}
2236
2237static Py_UCS4*
2238as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2239 int copy_null)
2240{
2241 int kind;
2242 void *data;
2243 Py_ssize_t len, targetlen;
2244 if (PyUnicode_READY(string) == -1)
2245 return NULL;
2246 kind = PyUnicode_KIND(string);
2247 data = PyUnicode_DATA(string);
2248 len = PyUnicode_GET_LENGTH(string);
2249 targetlen = len;
2250 if (copy_null)
2251 targetlen++;
2252 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002253 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254 if (!target) {
2255 PyErr_NoMemory();
2256 return NULL;
2257 }
2258 }
2259 else {
2260 if (targetsize < targetlen) {
2261 PyErr_Format(PyExc_SystemError,
2262 "string is longer than the buffer");
2263 if (copy_null && 0 < targetsize)
2264 target[0] = 0;
2265 return NULL;
2266 }
2267 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002268 if (kind == PyUnicode_1BYTE_KIND) {
2269 Py_UCS1 *start = (Py_UCS1 *) data;
2270 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002272 else if (kind == PyUnicode_2BYTE_KIND) {
2273 Py_UCS2 *start = (Py_UCS2 *) data;
2274 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2275 }
2276 else {
2277 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002279 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002280 if (copy_null)
2281 target[len] = 0;
2282 return target;
2283}
2284
2285Py_UCS4*
2286PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2287 int copy_null)
2288{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002289 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002290 PyErr_BadInternalCall();
2291 return NULL;
2292 }
2293 return as_ucs4(string, target, targetsize, copy_null);
2294}
2295
2296Py_UCS4*
2297PyUnicode_AsUCS4Copy(PyObject *string)
2298{
2299 return as_ucs4(string, NULL, 0, 1);
2300}
2301
2302#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002303
Alexander Belopolsky40018472011-02-26 01:02:56 +00002304PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002305PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002306{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002307 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002309 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002310 PyErr_BadInternalCall();
2311 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312 }
2313
Martin v. Löwis790465f2008-04-05 20:41:37 +00002314 if (size == -1) {
2315 size = wcslen(w);
2316 }
2317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002318 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002319}
2320
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002321#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002322
Victor Stinner15a11362012-10-06 23:48:20 +02002323/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002324 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2325 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2326#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002327
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002328static int
2329unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2330 Py_ssize_t width, Py_ssize_t precision)
2331{
2332 Py_ssize_t length, fill, arglen;
2333 Py_UCS4 maxchar;
2334
2335 if (PyUnicode_READY(str) == -1)
2336 return -1;
2337
2338 length = PyUnicode_GET_LENGTH(str);
2339 if ((precision == -1 || precision >= length)
2340 && width <= length)
2341 return _PyUnicodeWriter_WriteStr(writer, str);
2342
2343 if (precision != -1)
2344 length = Py_MIN(precision, length);
2345
2346 arglen = Py_MAX(length, width);
2347 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2348 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2349 else
2350 maxchar = writer->maxchar;
2351
2352 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2353 return -1;
2354
2355 if (width > length) {
2356 fill = width - length;
2357 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2358 return -1;
2359 writer->pos += fill;
2360 }
2361
2362 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2363 str, 0, length);
2364 writer->pos += length;
2365 return 0;
2366}
2367
2368static int
2369unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2370 Py_ssize_t width, Py_ssize_t precision)
2371{
2372 /* UTF-8 */
2373 Py_ssize_t length;
2374 PyObject *unicode;
2375 int res;
2376
2377 length = strlen(str);
2378 if (precision != -1)
2379 length = Py_MIN(length, precision);
2380 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2381 if (unicode == NULL)
2382 return -1;
2383
2384 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2385 Py_DECREF(unicode);
2386 return res;
2387}
2388
Victor Stinner96865452011-03-01 23:44:09 +00002389static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002390unicode_fromformat_arg(_PyUnicodeWriter *writer,
2391 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002392{
Victor Stinnere215d962012-10-06 23:03:36 +02002393 const char *p;
2394 Py_ssize_t len;
2395 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002396 Py_ssize_t width;
2397 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002398 int longflag;
2399 int longlongflag;
2400 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002401 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002402
2403 p = f;
2404 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002405 zeropad = 0;
2406 if (*f == '0') {
2407 zeropad = 1;
2408 f++;
2409 }
Victor Stinner96865452011-03-01 23:44:09 +00002410
2411 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002412 width = -1;
2413 if (Py_ISDIGIT((unsigned)*f)) {
2414 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002415 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002416 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002417 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002418 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002419 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002420 return NULL;
2421 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002422 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002423 f++;
2424 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002425 }
2426 precision = -1;
2427 if (*f == '.') {
2428 f++;
2429 if (Py_ISDIGIT((unsigned)*f)) {
2430 precision = (*f - '0');
2431 f++;
2432 while (Py_ISDIGIT((unsigned)*f)) {
2433 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2434 PyErr_SetString(PyExc_ValueError,
2435 "precision too big");
2436 return NULL;
2437 }
2438 precision = (precision * 10) + (*f - '0');
2439 f++;
2440 }
2441 }
Victor Stinner96865452011-03-01 23:44:09 +00002442 if (*f == '%') {
2443 /* "%.3%s" => f points to "3" */
2444 f--;
2445 }
2446 }
2447 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002448 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002449 f--;
2450 }
Victor Stinner96865452011-03-01 23:44:09 +00002451
2452 /* Handle %ld, %lu, %lld and %llu. */
2453 longflag = 0;
2454 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002455 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002456 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002457 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002458 longflag = 1;
2459 ++f;
2460 }
2461#ifdef HAVE_LONG_LONG
2462 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002463 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002464 longlongflag = 1;
2465 f += 2;
2466 }
2467#endif
2468 }
2469 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002470 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002471 size_tflag = 1;
2472 ++f;
2473 }
Victor Stinnere215d962012-10-06 23:03:36 +02002474
2475 if (f[1] == '\0')
2476 writer->overallocate = 0;
2477
2478 switch (*f) {
2479 case 'c':
2480 {
2481 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002482 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002483 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002484 "character argument not in range(0x110000)");
2485 return NULL;
2486 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002487 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002488 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002489 break;
2490 }
2491
2492 case 'i':
2493 case 'd':
2494 case 'u':
2495 case 'x':
2496 {
2497 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002498 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002499 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002500
2501 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002502 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002503 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002504 va_arg(*vargs, unsigned long));
2505#ifdef HAVE_LONG_LONG
2506 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002507 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002508 va_arg(*vargs, unsigned PY_LONG_LONG));
2509#endif
2510 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002511 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002512 va_arg(*vargs, size_t));
2513 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002514 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002515 va_arg(*vargs, unsigned int));
2516 }
2517 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002518 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002519 }
2520 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002521 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002522 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002523 va_arg(*vargs, long));
2524#ifdef HAVE_LONG_LONG
2525 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002526 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002527 va_arg(*vargs, PY_LONG_LONG));
2528#endif
2529 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002530 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002531 va_arg(*vargs, Py_ssize_t));
2532 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002533 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002534 va_arg(*vargs, int));
2535 }
2536 assert(len >= 0);
2537
Victor Stinnere215d962012-10-06 23:03:36 +02002538 if (precision < len)
2539 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002540
2541 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002542 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2543 return NULL;
2544
Victor Stinnere215d962012-10-06 23:03:36 +02002545 if (width > precision) {
2546 Py_UCS4 fillchar;
2547 fill = width - precision;
2548 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002549 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2550 return NULL;
2551 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002552 }
Victor Stinner15a11362012-10-06 23:48:20 +02002553 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002554 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002555 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2556 return NULL;
2557 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002558 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002559
Victor Stinner4a587072013-11-19 12:54:53 +01002560 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2561 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002562 break;
2563 }
2564
2565 case 'p':
2566 {
2567 char number[MAX_LONG_LONG_CHARS];
2568
2569 len = sprintf(number, "%p", va_arg(*vargs, void*));
2570 assert(len >= 0);
2571
2572 /* %p is ill-defined: ensure leading 0x. */
2573 if (number[1] == 'X')
2574 number[1] = 'x';
2575 else if (number[1] != 'x') {
2576 memmove(number + 2, number,
2577 strlen(number) + 1);
2578 number[0] = '0';
2579 number[1] = 'x';
2580 len += 2;
2581 }
2582
Victor Stinner4a587072013-11-19 12:54:53 +01002583 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002584 return NULL;
2585 break;
2586 }
2587
2588 case 's':
2589 {
2590 /* UTF-8 */
2591 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002592 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002593 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002594 break;
2595 }
2596
2597 case 'U':
2598 {
2599 PyObject *obj = va_arg(*vargs, PyObject *);
2600 assert(obj && _PyUnicode_CHECK(obj));
2601
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002602 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002603 return NULL;
2604 break;
2605 }
2606
2607 case 'V':
2608 {
2609 PyObject *obj = va_arg(*vargs, PyObject *);
2610 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002611 if (obj) {
2612 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002613 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002614 return NULL;
2615 }
2616 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002617 assert(str != NULL);
2618 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002619 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002620 }
2621 break;
2622 }
2623
2624 case 'S':
2625 {
2626 PyObject *obj = va_arg(*vargs, PyObject *);
2627 PyObject *str;
2628 assert(obj);
2629 str = PyObject_Str(obj);
2630 if (!str)
2631 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002632 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002633 Py_DECREF(str);
2634 return NULL;
2635 }
2636 Py_DECREF(str);
2637 break;
2638 }
2639
2640 case 'R':
2641 {
2642 PyObject *obj = va_arg(*vargs, PyObject *);
2643 PyObject *repr;
2644 assert(obj);
2645 repr = PyObject_Repr(obj);
2646 if (!repr)
2647 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002648 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002649 Py_DECREF(repr);
2650 return NULL;
2651 }
2652 Py_DECREF(repr);
2653 break;
2654 }
2655
2656 case 'A':
2657 {
2658 PyObject *obj = va_arg(*vargs, PyObject *);
2659 PyObject *ascii;
2660 assert(obj);
2661 ascii = PyObject_ASCII(obj);
2662 if (!ascii)
2663 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002664 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002665 Py_DECREF(ascii);
2666 return NULL;
2667 }
2668 Py_DECREF(ascii);
2669 break;
2670 }
2671
2672 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002673 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002674 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002675 break;
2676
2677 default:
2678 /* if we stumble upon an unknown formatting code, copy the rest
2679 of the format string to the output string. (we cannot just
2680 skip the code, since there's no way to know what's in the
2681 argument list) */
2682 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002683 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002684 return NULL;
2685 f = p+len;
2686 return f;
2687 }
2688
2689 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002690 return f;
2691}
2692
Walter Dörwaldd2034312007-05-18 16:29:38 +00002693PyObject *
2694PyUnicode_FromFormatV(const char *format, va_list vargs)
2695{
Victor Stinnere215d962012-10-06 23:03:36 +02002696 va_list vargs2;
2697 const char *f;
2698 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002699
Victor Stinner8f674cc2013-04-17 23:02:17 +02002700 _PyUnicodeWriter_Init(&writer);
2701 writer.min_length = strlen(format) + 100;
2702 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002703
2704 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2705 Copy it to be able to pass a reference to a subfunction. */
2706 Py_VA_COPY(vargs2, vargs);
2707
2708 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002709 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002710 f = unicode_fromformat_arg(&writer, f, &vargs2);
2711 if (f == NULL)
2712 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002713 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002714 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002715 const char *p;
2716 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002717
Victor Stinnere215d962012-10-06 23:03:36 +02002718 p = f;
2719 do
2720 {
2721 if ((unsigned char)*p > 127) {
2722 PyErr_Format(PyExc_ValueError,
2723 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2724 "string, got a non-ASCII byte: 0x%02x",
2725 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002726 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002727 }
2728 p++;
2729 }
2730 while (*p != '\0' && *p != '%');
2731 len = p - f;
2732
2733 if (*p == '\0')
2734 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002735
2736 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002737 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002738
2739 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002740 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002741 }
Victor Stinnere215d962012-10-06 23:03:36 +02002742 return _PyUnicodeWriter_Finish(&writer);
2743
2744 fail:
2745 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002746 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002747}
2748
Walter Dörwaldd2034312007-05-18 16:29:38 +00002749PyObject *
2750PyUnicode_FromFormat(const char *format, ...)
2751{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002752 PyObject* ret;
2753 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002754
2755#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002756 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002757#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002758 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002759#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002760 ret = PyUnicode_FromFormatV(format, vargs);
2761 va_end(vargs);
2762 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002763}
2764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002765#ifdef HAVE_WCHAR_H
2766
Victor Stinner5593d8a2010-10-02 11:11:27 +00002767/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2768 convert a Unicode object to a wide character string.
2769
Victor Stinnerd88d9832011-09-06 02:00:05 +02002770 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002771 character) required to convert the unicode object. Ignore size argument.
2772
Victor Stinnerd88d9832011-09-06 02:00:05 +02002773 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002774 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002775 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002776static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002777unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002778 wchar_t *w,
2779 Py_ssize_t size)
2780{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002781 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002782 const wchar_t *wstr;
2783
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002784 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002785 if (wstr == NULL)
2786 return -1;
2787
Victor Stinner5593d8a2010-10-02 11:11:27 +00002788 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002789 if (size > res)
2790 size = res + 1;
2791 else
2792 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002793 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002794 return res;
2795 }
2796 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002797 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002798}
2799
2800Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002801PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002802 wchar_t *w,
2803 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804{
2805 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002806 PyErr_BadInternalCall();
2807 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002809 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810}
2811
Victor Stinner137c34c2010-09-29 10:25:54 +00002812wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002813PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002814 Py_ssize_t *size)
2815{
2816 wchar_t* buffer;
2817 Py_ssize_t buflen;
2818
2819 if (unicode == NULL) {
2820 PyErr_BadInternalCall();
2821 return NULL;
2822 }
2823
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002824 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002825 if (buflen == -1)
2826 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002827 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00002828 if (buffer == NULL) {
2829 PyErr_NoMemory();
2830 return NULL;
2831 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002832 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002833 if (buflen == -1) {
2834 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002835 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002836 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002837 if (size != NULL)
2838 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002839 return buffer;
2840}
2841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002842#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843
Alexander Belopolsky40018472011-02-26 01:02:56 +00002844PyObject *
2845PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002846{
Victor Stinner8faf8212011-12-08 22:14:11 +01002847 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002848 PyErr_SetString(PyExc_ValueError,
2849 "chr() arg not in range(0x110000)");
2850 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002851 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002852
Victor Stinner985a82a2014-01-03 12:53:47 +01002853 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002854}
2855
Alexander Belopolsky40018472011-02-26 01:02:56 +00002856PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002857PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002858{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002859 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002860 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002861 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002862 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002863 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002864 Py_INCREF(obj);
2865 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002866 }
2867 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002868 /* For a Unicode subtype that's not a Unicode object,
2869 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002870 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002871 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002872 PyErr_Format(PyExc_TypeError,
2873 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002874 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002875 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002876}
2877
Alexander Belopolsky40018472011-02-26 01:02:56 +00002878PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002879PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002880 const char *encoding,
2881 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002882{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002883 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002884 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002885
Guido van Rossumd57fd912000-03-10 22:53:23 +00002886 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002887 PyErr_BadInternalCall();
2888 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002890
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002891 /* Decoding bytes objects is the most common case and should be fast */
2892 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002893 if (PyBytes_GET_SIZE(obj) == 0)
2894 _Py_RETURN_UNICODE_EMPTY();
2895 v = PyUnicode_Decode(
2896 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2897 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002898 return v;
2899 }
2900
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002901 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002902 PyErr_SetString(PyExc_TypeError,
2903 "decoding str is not supported");
2904 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002905 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002906
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002907 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2908 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2909 PyErr_Format(PyExc_TypeError,
Serhiy Storchakab757c832014-12-05 22:25:22 +02002910 "coercing to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002911 Py_TYPE(obj)->tp_name);
2912 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002913 }
Tim Petersced69f82003-09-16 20:30:58 +00002914
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002915 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002916 PyBuffer_Release(&buffer);
2917 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002918 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002919
Serhiy Storchaka05997252013-01-26 12:14:02 +02002920 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002921 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002922 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002923}
2924
/* Normalize an encoding name into the caller-supplied buffer `lower`, whose
   capacity is `lower_len` bytes including the terminating null byte:
   characters for which Py_ISUPPER() is true are passed through
   Py_TOLOWER(), and '_' is replaced with '-', in order to catch e.g. UTF_8.
   A NULL encoding is normalized to "utf-8".

   Return 1 on success, 0 on error (the normalized name is longer than
   lower_len-1, which includes the case lower_len == 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* A zero-sized buffer cannot even hold the null byte.  Without this
       check lower_len - 1 wraps around, the end bound below is meaningless
       and the loop would write outside the buffer. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last writable slot; it is reserved for the null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2964
Alexander Belopolsky40018472011-02-26 01:02:56 +00002965PyObject *
2966PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002967 Py_ssize_t size,
2968 const char *encoding,
2969 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002970{
2971 PyObject *buffer = NULL, *unicode;
2972 Py_buffer info;
2973 char lower[11]; /* Enough for any encoding shortcut */
2974
Fred Drakee4315f52000-05-09 19:53:39 +00002975 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01002976 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002977 if ((strcmp(lower, "utf-8") == 0) ||
2978 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002979 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002980 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002981 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01002982 (strcmp(lower, "iso-8859-1") == 0) ||
2983 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002984 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002985#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002986 else if (strcmp(lower, "mbcs") == 0)
2987 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002988#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002989 else if (strcmp(lower, "ascii") == 0)
2990 return PyUnicode_DecodeASCII(s, size, errors);
2991 else if (strcmp(lower, "utf-16") == 0)
2992 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2993 else if (strcmp(lower, "utf-32") == 0)
2994 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2995 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996
2997 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002998 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002999 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003000 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003001 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003002 if (buffer == NULL)
3003 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003004 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003005 if (unicode == NULL)
3006 goto onError;
3007 if (!PyUnicode_Check(unicode)) {
3008 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003009 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3010 "use codecs.decode() to decode to arbitrary types",
3011 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003012 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003013 Py_DECREF(unicode);
3014 goto onError;
3015 }
3016 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003017 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003018
Benjamin Peterson29060642009-01-31 22:14:21 +00003019 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020 Py_XDECREF(buffer);
3021 return NULL;
3022}
3023
Alexander Belopolsky40018472011-02-26 01:02:56 +00003024PyObject *
3025PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003026 const char *encoding,
3027 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003028{
3029 PyObject *v;
3030
3031 if (!PyUnicode_Check(unicode)) {
3032 PyErr_BadArgument();
3033 goto onError;
3034 }
3035
3036 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003037 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003038
3039 /* Decode via the codec registry */
3040 v = PyCodec_Decode(unicode, encoding, errors);
3041 if (v == NULL)
3042 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003043 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003044
Benjamin Peterson29060642009-01-31 22:14:21 +00003045 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003046 return NULL;
3047}
3048
Alexander Belopolsky40018472011-02-26 01:02:56 +00003049PyObject *
3050PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003051 const char *encoding,
3052 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003053{
3054 PyObject *v;
3055
3056 if (!PyUnicode_Check(unicode)) {
3057 PyErr_BadArgument();
3058 goto onError;
3059 }
3060
3061 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003062 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003063
3064 /* Decode via the codec registry */
3065 v = PyCodec_Decode(unicode, encoding, errors);
3066 if (v == NULL)
3067 goto onError;
3068 if (!PyUnicode_Check(v)) {
3069 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003070 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3071 "use codecs.decode() to decode to arbitrary types",
3072 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003073 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003074 Py_DECREF(v);
3075 goto onError;
3076 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003077 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003078
Benjamin Peterson29060642009-01-31 22:14:21 +00003079 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003080 return NULL;
3081}
3082
Alexander Belopolsky40018472011-02-26 01:02:56 +00003083PyObject *
3084PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003085 Py_ssize_t size,
3086 const char *encoding,
3087 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088{
3089 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003090
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 unicode = PyUnicode_FromUnicode(s, size);
3092 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003093 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003094 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3095 Py_DECREF(unicode);
3096 return v;
3097}
3098
Alexander Belopolsky40018472011-02-26 01:02:56 +00003099PyObject *
3100PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003101 const char *encoding,
3102 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003103{
3104 PyObject *v;
3105
3106 if (!PyUnicode_Check(unicode)) {
3107 PyErr_BadArgument();
3108 goto onError;
3109 }
3110
3111 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003112 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003113
3114 /* Encode via the codec registry */
3115 v = PyCodec_Encode(unicode, encoding, errors);
3116 if (v == NULL)
3117 goto onError;
3118 return v;
3119
Benjamin Peterson29060642009-01-31 22:14:21 +00003120 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003121 return NULL;
3122}
3123
/* Locate the first wide character of `wstr` that wcstombs() cannot convert.

   The string is converted one character at a time (a surrogate pair counts
   as one character when wchar_t is 2 bytes wide).  Return the index, in
   wchar_t units, of the first character for which wcstombs() fails, or 0
   if every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    const wchar_t *cursor = wstr;
    char encoded[MB_LEN_MAX];
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3];   /* surrogate pair + null */
#else
    wchar_t chunk[2];   /* one character + null */
#endif

#if SIZEOF_WCHAR_T == 2
    chunk[2] = 0;
#else
    chunk[1] = 0;
#endif

    while (*cursor != L'\0') {
        const wchar_t *char_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep the pair together so it is converted as one character. */
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = cursor[0];
            chunk[1] = 0;
            cursor += 1;
        }
#else
        chunk[0] = cursor[0];
        cursor += 1;
#endif
        if (wcstombs(encoded, chunk, sizeof(encoded)) == (size_t)-1)
            return char_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3170
Victor Stinner1b579672011-12-17 05:47:23 +01003171static int
3172locale_error_handler(const char *errors, int *surrogateescape)
3173{
3174 if (errors == NULL) {
3175 *surrogateescape = 0;
3176 return 0;
3177 }
3178
3179 if (strcmp(errors, "strict") == 0) {
3180 *surrogateescape = 0;
3181 return 0;
3182 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003183 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003184 *surrogateescape = 1;
3185 return 0;
3186 }
3187 PyErr_Format(PyExc_ValueError,
3188 "only 'strict' and 'surrogateescape' error handlers "
3189 "are supported, not '%s'",
3190 errors);
3191 return -1;
3192}
3193
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003194PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003195PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003196{
3197 Py_ssize_t wlen, wlen2;
3198 wchar_t *wstr;
3199 PyObject *bytes = NULL;
3200 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003201 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003202 PyObject *exc;
3203 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003204 int surrogateescape;
3205
3206 if (locale_error_handler(errors, &surrogateescape) < 0)
3207 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003208
3209 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3210 if (wstr == NULL)
3211 return NULL;
3212
3213 wlen2 = wcslen(wstr);
3214 if (wlen2 != wlen) {
3215 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003216 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003217 return NULL;
3218 }
3219
3220 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003221 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003222 char *str;
3223
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003224 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003225 if (str == NULL) {
3226 if (error_pos == (size_t)-1) {
3227 PyErr_NoMemory();
3228 PyMem_Free(wstr);
3229 return NULL;
3230 }
3231 else {
3232 goto encode_error;
3233 }
3234 }
3235 PyMem_Free(wstr);
3236
3237 bytes = PyBytes_FromString(str);
3238 PyMem_Free(str);
3239 }
3240 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003241 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003242 size_t len, len2;
3243
3244 len = wcstombs(NULL, wstr, 0);
3245 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003246 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003247 goto encode_error;
3248 }
3249
3250 bytes = PyBytes_FromStringAndSize(NULL, len);
3251 if (bytes == NULL) {
3252 PyMem_Free(wstr);
3253 return NULL;
3254 }
3255
3256 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3257 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003258 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003259 goto encode_error;
3260 }
3261 PyMem_Free(wstr);
3262 }
3263 return bytes;
3264
3265encode_error:
3266 errmsg = strerror(errno);
3267 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003268
3269 if (error_pos == (size_t)-1)
3270 error_pos = wcstombs_errorpos(wstr);
3271
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003272 PyMem_Free(wstr);
3273 Py_XDECREF(bytes);
3274
Victor Stinner2f197072011-12-17 07:08:30 +01003275 if (errmsg != NULL) {
3276 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003277 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003278 if (wstr != NULL) {
3279 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003280 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003281 } else
3282 errmsg = NULL;
3283 }
3284 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003285 reason = PyUnicode_FromString(
3286 "wcstombs() encountered an unencodable "
3287 "wide character");
3288 if (reason == NULL)
3289 return NULL;
3290
3291 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3292 "locale", unicode,
3293 (Py_ssize_t)error_pos,
3294 (Py_ssize_t)(error_pos+1),
3295 reason);
3296 Py_DECREF(reason);
3297 if (exc != NULL) {
3298 PyCodec_StrictErrors(exc);
3299 Py_XDECREF(exc);
3300 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003301 return NULL;
3302}
3303
Victor Stinnerad158722010-10-27 00:25:46 +00003304PyObject *
3305PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003306{
Victor Stinner99b95382011-07-04 14:23:54 +02003307#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003308 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003309#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003310 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003311#else
Victor Stinner793b5312011-04-27 00:24:21 +02003312 PyInterpreterState *interp = PyThreadState_GET()->interp;
3313 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3314 cannot use it to encode and decode filenames before it is loaded. Load
3315 the Python codec requires to encode at least its own filename. Use the C
3316 version of the locale codec until the codec registry is initialized and
3317 the Python codec is loaded.
3318
3319 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3320 cannot only rely on it: check also interp->fscodec_initialized for
3321 subinterpreters. */
3322 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003323 return PyUnicode_AsEncodedString(unicode,
3324 Py_FileSystemDefaultEncoding,
3325 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003326 }
3327 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003328 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003329 }
Victor Stinnerad158722010-10-27 00:25:46 +00003330#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003331}
3332
Alexander Belopolsky40018472011-02-26 01:02:56 +00003333PyObject *
3334PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003335 const char *encoding,
3336 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337{
3338 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003339 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003340
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341 if (!PyUnicode_Check(unicode)) {
3342 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003343 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003344 }
Fred Drakee4315f52000-05-09 19:53:39 +00003345
Fred Drakee4315f52000-05-09 19:53:39 +00003346 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003347 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003348 if ((strcmp(lower, "utf-8") == 0) ||
3349 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003350 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003351 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003352 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003353 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003354 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003355 }
Victor Stinner37296e82010-06-10 13:36:23 +00003356 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003357 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003358 (strcmp(lower, "iso-8859-1") == 0) ||
3359 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003360 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003361#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003362 else if (strcmp(lower, "mbcs") == 0)
3363 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003364#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003365 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003366 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003367 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003368
3369 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003370 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003371 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003372 return NULL;
3373
3374 /* The normal path */
3375 if (PyBytes_Check(v))
3376 return v;
3377
3378 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003379 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003380 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003381 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003382
3383 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003384 "encoder %s returned bytearray instead of bytes; "
3385 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003386 encoding);
3387 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003388 Py_DECREF(v);
3389 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003390 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003391
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003392 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3393 Py_DECREF(v);
3394 return b;
3395 }
3396
3397 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003398 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3399 "use codecs.encode() to encode to arbitrary types",
3400 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003401 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003402 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003403 return NULL;
3404}
3405
Alexander Belopolsky40018472011-02-26 01:02:56 +00003406PyObject *
3407PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003408 const char *encoding,
3409 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003410{
3411 PyObject *v;
3412
3413 if (!PyUnicode_Check(unicode)) {
3414 PyErr_BadArgument();
3415 goto onError;
3416 }
3417
3418 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003419 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003420
3421 /* Encode via the codec registry */
3422 v = PyCodec_Encode(unicode, encoding, errors);
3423 if (v == NULL)
3424 goto onError;
3425 if (!PyUnicode_Check(v)) {
3426 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003427 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3428 "use codecs.encode() to encode to arbitrary types",
3429 encoding,
Benjamin Peterson8d761ff2016-10-16 15:41:46 -07003430 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003431 Py_DECREF(v);
3432 goto onError;
3433 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003434 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003435
Benjamin Peterson29060642009-01-31 22:14:21 +00003436 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437 return NULL;
3438}
3439
/* Locate the first byte sequence in the `len` bytes at `str` that mbrtowc()
   cannot decode.

   Return the byte offset of the first invalid or incomplete multibyte
   sequence.  Return 0 if none is found, or if HAVE_MBRTOWC is not defined
   (no scan is performed in that case). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3470
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003471PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003472PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003473 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003474{
3475 wchar_t smallbuf[256];
3476 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3477 wchar_t *wstr;
3478 size_t wlen, wlen2;
3479 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003480 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003481 size_t error_pos;
3482 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003483 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3484 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003485
3486 if (locale_error_handler(errors, &surrogateescape) < 0)
3487 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003488
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003489 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3490 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003491 return NULL;
3492 }
3493
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003494 if (surrogateescape) {
3495 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003496 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003497 if (wstr == NULL) {
3498 if (wlen == (size_t)-1)
3499 PyErr_NoMemory();
3500 else
3501 PyErr_SetFromErrno(PyExc_OSError);
3502 return NULL;
3503 }
3504
3505 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003506 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003507 }
3508 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003509 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003510#ifndef HAVE_BROKEN_MBSTOWCS
3511 wlen = mbstowcs(NULL, str, 0);
3512#else
3513 wlen = len;
3514#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003515 if (wlen == (size_t)-1)
3516 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003517 if (wlen+1 <= smallbuf_len) {
3518 wstr = smallbuf;
3519 }
3520 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003521 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003522 if (!wstr)
3523 return PyErr_NoMemory();
3524 }
3525
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003526 wlen2 = mbstowcs(wstr, str, wlen+1);
3527 if (wlen2 == (size_t)-1) {
3528 if (wstr != smallbuf)
3529 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003530 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003531 }
3532#ifdef HAVE_BROKEN_MBSTOWCS
3533 assert(wlen2 == wlen);
3534#endif
3535 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3536 if (wstr != smallbuf)
3537 PyMem_Free(wstr);
3538 }
3539 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003540
3541decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003542 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003543 errmsg = strerror(errno);
3544 assert(errmsg != NULL);
3545
3546 error_pos = mbstowcs_errorpos(str, len);
3547 if (errmsg != NULL) {
3548 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003549 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003550 if (wstr != NULL) {
3551 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003552 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003553 }
Victor Stinner2f197072011-12-17 07:08:30 +01003554 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003555 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003556 reason = PyUnicode_FromString(
3557 "mbstowcs() encountered an invalid multibyte sequence");
3558 if (reason == NULL)
3559 return NULL;
3560
3561 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3562 "locale", str, len,
3563 (Py_ssize_t)error_pos,
3564 (Py_ssize_t)(error_pos+1),
3565 reason);
3566 Py_DECREF(reason);
3567 if (exc != NULL) {
3568 PyCodec_StrictErrors(exc);
3569 Py_XDECREF(exc);
3570 }
3571 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003572}
3573
3574PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003575PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003576{
3577 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003578 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003579}
3580
3581
3582PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003583PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003584 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003585 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3586}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003587
Christian Heimes5894ba72007-11-04 11:43:14 +00003588PyObject*
3589PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3590{
Victor Stinner99b95382011-07-04 14:23:54 +02003591#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003592 return PyUnicode_DecodeMBCS(s, size, NULL);
3593#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003594 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003595#else
Victor Stinner793b5312011-04-27 00:24:21 +02003596 PyInterpreterState *interp = PyThreadState_GET()->interp;
3597 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3598 cannot use it to encode and decode filenames before it is loaded. Load
3599 the Python codec requires to encode at least its own filename. Use the C
3600 version of the locale codec until the codec registry is initialized and
3601 the Python codec is loaded.
3602
3603 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3604 cannot only rely on it: check also interp->fscodec_initialized for
3605 subinterpreters. */
3606 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003607 return PyUnicode_Decode(s, size,
3608 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003609 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003610 }
3611 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003612 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003613 }
Victor Stinnerad158722010-10-27 00:25:46 +00003614#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003615}
3616
Martin v. Löwis011e8422009-05-05 04:43:17 +00003617
3618int
3619PyUnicode_FSConverter(PyObject* arg, void* addr)
3620{
3621 PyObject *output = NULL;
3622 Py_ssize_t size;
3623 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003624 if (arg == NULL) {
3625 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003626 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003627 return 1;
3628 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003629 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003630 output = arg;
3631 Py_INCREF(output);
3632 }
3633 else {
3634 arg = PyUnicode_FromObject(arg);
3635 if (!arg)
3636 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003637 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003638 Py_DECREF(arg);
3639 if (!output)
3640 return 0;
3641 if (!PyBytes_Check(output)) {
3642 Py_DECREF(output);
3643 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3644 return 0;
3645 }
3646 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003647 size = PyBytes_GET_SIZE(output);
3648 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003649 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003650 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003651 Py_DECREF(output);
3652 return 0;
3653 }
3654 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003655 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003656}
3657
3658
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003659int
3660PyUnicode_FSDecoder(PyObject* arg, void* addr)
3661{
3662 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003663 if (arg == NULL) {
3664 Py_DECREF(*(PyObject**)addr);
3665 return 1;
3666 }
3667 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003668 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003669 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003670 output = arg;
3671 Py_INCREF(output);
3672 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003673 else if (PyObject_CheckBuffer(arg)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003674 arg = PyBytes_FromObject(arg);
3675 if (!arg)
3676 return 0;
3677 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3678 PyBytes_GET_SIZE(arg));
3679 Py_DECREF(arg);
3680 if (!output)
3681 return 0;
3682 if (!PyUnicode_Check(output)) {
3683 Py_DECREF(output);
3684 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3685 return 0;
3686 }
3687 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003688 else {
3689 PyErr_Format(PyExc_TypeError,
3690 "path should be string or bytes, not %.200s",
3691 Py_TYPE(arg)->tp_name);
3692 return 0;
3693 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003694 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003695 Py_DECREF(output);
3696 return 0;
3697 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003698 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003699 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003700 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003701 Py_DECREF(output);
3702 return 0;
3703 }
3704 *(PyObject**)addr = output;
3705 return Py_CLEANUP_SUPPORTED;
3706}
3707
3708
Martin v. Löwis5b222132007-06-10 09:51:05 +00003709char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003710PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003711{
Christian Heimesf3863112007-11-22 07:46:41 +00003712 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003713
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003714 if (!PyUnicode_Check(unicode)) {
3715 PyErr_BadArgument();
3716 return NULL;
3717 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003718 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003719 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003720
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003721 if (PyUnicode_UTF8(unicode) == NULL) {
3722 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003723 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3724 if (bytes == NULL)
3725 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003726 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3727 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003728 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003729 Py_DECREF(bytes);
3730 return NULL;
3731 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003732 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3733 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3734 PyBytes_AS_STRING(bytes),
3735 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003736 Py_DECREF(bytes);
3737 }
3738
3739 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003740 *psize = PyUnicode_UTF8_LENGTH(unicode);
3741 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003742}
3743
3744char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003745PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003746{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003747 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3748}
3749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003750Py_UNICODE *
3751PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3752{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003753 const unsigned char *one_byte;
3754#if SIZEOF_WCHAR_T == 4
3755 const Py_UCS2 *two_bytes;
3756#else
3757 const Py_UCS4 *four_bytes;
3758 const Py_UCS4 *ucs4_end;
3759 Py_ssize_t num_surrogates;
3760#endif
3761 wchar_t *w;
3762 wchar_t *wchar_end;
3763
3764 if (!PyUnicode_Check(unicode)) {
3765 PyErr_BadArgument();
3766 return NULL;
3767 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003768 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003769 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003770 assert(_PyUnicode_KIND(unicode) != 0);
3771 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003772
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003773 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003774#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003775 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3776 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003777 num_surrogates = 0;
3778
3779 for (; four_bytes < ucs4_end; ++four_bytes) {
3780 if (*four_bytes > 0xFFFF)
3781 ++num_surrogates;
3782 }
3783
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003784 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3785 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3786 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003787 PyErr_NoMemory();
3788 return NULL;
3789 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003790 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003791
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003792 w = _PyUnicode_WSTR(unicode);
3793 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3794 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003795 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3796 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003797 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003798 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003799 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3800 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003801 }
3802 else
3803 *w = *four_bytes;
3804
3805 if (w > wchar_end) {
3806 assert(0 && "Miscalculated string end");
3807 }
3808 }
3809 *w = 0;
3810#else
3811 /* sizeof(wchar_t) == 4 */
3812 Py_FatalError("Impossible unicode object state, wstr and str "
3813 "should share memory already.");
3814 return NULL;
3815#endif
3816 }
3817 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02003818 if ((size_t)_PyUnicode_LENGTH(unicode) >
3819 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3820 PyErr_NoMemory();
3821 return NULL;
3822 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003823 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3824 (_PyUnicode_LENGTH(unicode) + 1));
3825 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003826 PyErr_NoMemory();
3827 return NULL;
3828 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003829 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3830 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3831 w = _PyUnicode_WSTR(unicode);
3832 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003833
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003834 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3835 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003836 for (; w < wchar_end; ++one_byte, ++w)
3837 *w = *one_byte;
3838 /* null-terminate the wstr */
3839 *w = 0;
3840 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003841 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003842#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003843 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003844 for (; w < wchar_end; ++two_bytes, ++w)
3845 *w = *two_bytes;
3846 /* null-terminate the wstr */
3847 *w = 0;
3848#else
3849 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003850 PyObject_FREE(_PyUnicode_WSTR(unicode));
3851 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003852 Py_FatalError("Impossible unicode object state, wstr "
3853 "and str should share memory already.");
3854 return NULL;
3855#endif
3856 }
3857 else {
3858 assert(0 && "This should never happen.");
3859 }
3860 }
3861 }
3862 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003863 *size = PyUnicode_WSTR_LENGTH(unicode);
3864 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003865}
3866
Alexander Belopolsky40018472011-02-26 01:02:56 +00003867Py_UNICODE *
3868PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003870 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871}
3872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003873
Alexander Belopolsky40018472011-02-26 01:02:56 +00003874Py_ssize_t
3875PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003876{
3877 if (!PyUnicode_Check(unicode)) {
3878 PyErr_BadArgument();
3879 goto onError;
3880 }
3881 return PyUnicode_GET_SIZE(unicode);
3882
Benjamin Peterson29060642009-01-31 22:14:21 +00003883 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003884 return -1;
3885}
3886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003887Py_ssize_t
3888PyUnicode_GetLength(PyObject *unicode)
3889{
Victor Stinner07621332012-06-16 04:53:46 +02003890 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003891 PyErr_BadArgument();
3892 return -1;
3893 }
Victor Stinner07621332012-06-16 04:53:46 +02003894 if (PyUnicode_READY(unicode) == -1)
3895 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003896 return PyUnicode_GET_LENGTH(unicode);
3897}
3898
3899Py_UCS4
3900PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3901{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003902 void *data;
3903 int kind;
3904
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003905 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3906 PyErr_BadArgument();
3907 return (Py_UCS4)-1;
3908 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003909 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003910 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003911 return (Py_UCS4)-1;
3912 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003913 data = PyUnicode_DATA(unicode);
3914 kind = PyUnicode_KIND(unicode);
3915 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003916}
3917
3918int
3919PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3920{
3921 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003922 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003923 return -1;
3924 }
Victor Stinner488fa492011-12-12 00:01:39 +01003925 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003926 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003927 PyErr_SetString(PyExc_IndexError, "string index out of range");
3928 return -1;
3929 }
Victor Stinner488fa492011-12-12 00:01:39 +01003930 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003931 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003932 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3933 PyErr_SetString(PyExc_ValueError, "character out of range");
3934 return -1;
3935 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003936 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3937 index, ch);
3938 return 0;
3939}
3940
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage duration. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3946
Victor Stinner554f3f02010-06-16 23:33:54 +00003947/* create or adjust a UnicodeDecodeError */
3948static void
3949make_decode_exception(PyObject **exceptionObject,
3950 const char *encoding,
3951 const char *input, Py_ssize_t length,
3952 Py_ssize_t startpos, Py_ssize_t endpos,
3953 const char *reason)
3954{
3955 if (*exceptionObject == NULL) {
3956 *exceptionObject = PyUnicodeDecodeError_Create(
3957 encoding, input, length, startpos, endpos, reason);
3958 }
3959 else {
3960 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3961 goto onError;
3962 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3963 goto onError;
3964 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3965 goto onError;
3966 }
3967 return;
3968
3969onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02003970 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00003971}
3972
#ifdef HAVE_MBCS
/* Error handling callback helper for decoders whose output is a
   PyUnicode_WCHAR_KIND string:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   errors, errorHandler: handler name and cached handler object; the
       handler is looked up on first use.
   encoding, reason: used to create or update *exceptionObject.
   input, inend, inptr: refreshed from the exception's bytes object, since
       the callback may have replaced it; *inptr is set to the position
       where decoding resumes.
   startinpos, endinpos: bounds of the undecodable input; *endinpos is set
       to the position returned by the handler.
   output, outpos: output string (resized when needed) and write position.

   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" format prefix of the message */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* do not go on and read a non-bytes object as bytes */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* a negative position counts from the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* grow geometrically when doubling is enough and cannot overflow */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen characters: a length-based copy also handles
       replacement strings containing embedded null characters, which
       wcsncpy() would truncate and pad with zeros. */
    Py_MEMCPY(_PyUnicode_WSTR(*output) + *outpos, repwstr,
              repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* fall through to the common cleanup */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4088
4089static int
4090unicode_decode_call_errorhandler_writer(
4091 const char *errors, PyObject **errorHandler,
4092 const char *encoding, const char *reason,
4093 const char **input, const char **inend, Py_ssize_t *startinpos,
4094 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4095 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4096{
4097 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4098
4099 PyObject *restuple = NULL;
4100 PyObject *repunicode = NULL;
4101 Py_ssize_t insize;
4102 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004103 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004104 PyObject *inputobj = NULL;
4105
4106 if (*errorHandler == NULL) {
4107 *errorHandler = PyCodec_LookupError(errors);
4108 if (*errorHandler == NULL)
4109 goto onError;
4110 }
4111
4112 make_decode_exception(exceptionObject,
4113 encoding,
4114 *input, *inend - *input,
4115 *startinpos, *endinpos,
4116 reason);
4117 if (*exceptionObject == NULL)
4118 goto onError;
4119
4120 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4121 if (restuple == NULL)
4122 goto onError;
4123 if (!PyTuple_Check(restuple)) {
4124 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4125 goto onError;
4126 }
4127 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004128 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004129
4130 /* Copy back the bytes variables, which might have been modified by the
4131 callback */
4132 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4133 if (!inputobj)
4134 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004135 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004136 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004137 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004138 *input = PyBytes_AS_STRING(inputobj);
4139 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004140 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004141 /* we can DECREF safely, as the exception has another reference,
4142 so the object won't go away. */
4143 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004144
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004145 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004146 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004147 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004148 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004149 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004150 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004151
Victor Stinner8f674cc2013-04-17 23:02:17 +02004152 if (PyUnicode_READY(repunicode) < 0)
4153 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004154 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004155 if (replen > 1) {
4156 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004157 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004158 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4159 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4160 goto onError;
4161 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004162 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004163 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004164
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004165 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004166 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004167
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004168 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004169 Py_XDECREF(restuple);
4170 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004171
Benjamin Peterson29060642009-01-31 22:14:21 +00004172 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004173 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004174 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004175}
4176
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004177/* --- UTF-7 Codec -------------------------------------------------------- */
4178
Antoine Pitrou244651a2009-05-04 18:56:13 +00004179/* See RFC2152 for details. We encode conservatively and decode liberally. */
4180
/* Base-64 helper macros. Each one may evaluate its argument several
   times, so arguments must be free of side effects. */

/* True if c belongs to the base-64 alphabet (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))

/* Value (0..63) of the base-64 character c. The result is only meaningful
   when IS_BASE64(c) holds; any other input yields 63. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     63)

/* Base-64 character encoding the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if a byte met in a UTF-7 string decodes to itself.
 * Decoding is permissive: every ASCII byte qualifies except '+', which
 * starts a base64 sequence. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) <= 127)
4211
4212/* The UTF-7 encoder treats ASCII characters differently according to
4213 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4214 * the above). See RFC2152. This array identifies these different
4215 * sets:
4216 * 0 : "Set D"
4217 * alphanumeric and '(),-./:?
4218 * 1 : "Set O"
4219 * !"#$%&*;<=>@[]^_`{|}
4220 * 2 : "whitespace"
4221 * ht nl cr sp
4222 * 3 : special (must be base64 encoded)
4223 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4224 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004225
Tim Petersced69f82003-09-16 20:30:58 +00004226static
Antoine Pitrou244651a2009-05-04 18:56:13 +00004227char utf7_category[128] = {
4228/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
4229 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
4230/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
4231 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4232/* sp ! " # $ % & ' ( ) * + , - . / */
4233 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
4234/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
4235 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
4236/* @ A B C D E F G H I J K L M N O */
4237 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4238/* P Q R S T U V W X Y Z [ \ ] ^ _ */
4239 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
4240/* ` a b c d e f g h i j k l m n o */
4241 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
4242/* p q r s t u v w x y z { | } ~ del */
4243 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004244};
4245
Antoine Pitrou244651a2009-05-04 18:56:13 +00004246/* ENCODE_DIRECT: this character should be encoded as itself. The
4247 * answer depends on whether we are encoding set O as itself, and also
4248 * on whether we are encoding whitespace as itself. RFC2152 makes it
4249 * clear that the answers to these questions vary between
4250 * applications, so this code needs to be flexible. */
Marc-André Lemburge115ec82005-10-19 22:33:31 +00004251
Antoine Pitrou244651a2009-05-04 18:56:13 +00004252#define ENCODE_DIRECT(c, directO, directWS) \
4253 ((c) < 128 && (c) > 0 && \
4254 ((utf7_category[(c)] == 0) || \
4255 (directWS && (utf7_category[(c)] == 2)) || \
4256 (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004257
Alexander Belopolsky40018472011-02-26 01:02:56 +00004258PyObject *
4259PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004260 Py_ssize_t size,
4261 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004262{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004263 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4264}
4265
Antoine Pitrou244651a2009-05-04 18:56:13 +00004266/* The decoder. The only state we preserve is our read position,
4267 * i.e. how many characters we have consumed. So if we end in the
4268 * middle of a shift sequence we have to back off the read position
4269 * and the output to the beginning of the sequence, otherwise we lose
4270 * all the shift state (seen bits, number of bits seen, high
4271 * surrogate). */
4272
Alexander Belopolsky40018472011-02-26 01:02:56 +00004273PyObject *
4274PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004275 Py_ssize_t size,
4276 const char *errors,
4277 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004278{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004279 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004280 Py_ssize_t startinpos;
4281 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004282 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004283 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004284 const char *errmsg = "";
4285 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004286 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004287 unsigned int base64bits = 0;
4288 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004289 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004290 PyObject *errorHandler = NULL;
4291 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004292
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004293 if (size == 0) {
4294 if (consumed)
4295 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004296 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004297 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004298
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004299 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004300 _PyUnicodeWriter_Init(&writer);
4301 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004302
4303 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004304 e = s + size;
4305
4306 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004307 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004308 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004309 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004310
Antoine Pitrou244651a2009-05-04 18:56:13 +00004311 if (inShift) { /* in a base-64 section */
4312 if (IS_BASE64(ch)) { /* consume a base-64 character */
4313 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4314 base64bits += 6;
4315 s++;
4316 if (base64bits >= 16) {
4317 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004318 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004319 base64bits -= 16;
4320 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004321 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004322 if (surrogate) {
4323 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004324 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4325 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004326 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004327 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004328 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004329 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004330 }
4331 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004332 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004333 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004334 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004335 }
4336 }
Victor Stinner551ac952011-11-29 22:58:13 +01004337 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004338 /* first surrogate */
4339 surrogate = outCh;
4340 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004341 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004342 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004343 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004344 }
4345 }
4346 }
4347 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004348 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004349 if (base64bits > 0) { /* left-over bits */
4350 if (base64bits >= 6) {
4351 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004352 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004353 errmsg = "partial character in shift sequence";
4354 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004355 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004356 else {
4357 /* Some bits remain; they should be zero */
4358 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004359 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004360 errmsg = "non-zero padding bits in shift sequence";
4361 goto utf7Error;
4362 }
4363 }
4364 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004365 if (surrogate && DECODE_DIRECT(ch)) {
4366 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4367 goto onError;
4368 }
4369 surrogate = 0;
4370 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004371 /* '-' is absorbed; other terminating
4372 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004373 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004374 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004375 }
4376 }
4377 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004379 s++; /* consume '+' */
4380 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004381 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004382 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004383 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004384 }
4385 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004386 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004387 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004388 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004389 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004390 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004391 }
4392 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004393 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004394 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004395 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004396 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004397 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004398 else {
4399 startinpos = s-starts;
4400 s++;
4401 errmsg = "unexpected special character";
4402 goto utf7Error;
4403 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004404 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004405utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004406 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004407 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004408 errors, &errorHandler,
4409 "utf7", errmsg,
4410 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004411 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004412 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004413 }
4414
Antoine Pitrou244651a2009-05-04 18:56:13 +00004415 /* end of string */
4416
4417 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4418 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004419 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004420 if (surrogate ||
4421 (base64bits >= 6) ||
4422 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004423 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004424 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004425 errors, &errorHandler,
4426 "utf7", "unterminated shift sequence",
4427 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004428 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004429 goto onError;
4430 if (s < e)
4431 goto restart;
4432 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004433 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004434
4435 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004436 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004437 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004438 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004439 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004440 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004441 writer.kind, writer.data, shiftOutStart);
4442 Py_XDECREF(errorHandler);
4443 Py_XDECREF(exc);
4444 _PyUnicodeWriter_Dealloc(&writer);
4445 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004446 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004447 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004448 }
4449 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004450 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004451 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004452 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004453
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004454 Py_XDECREF(errorHandler);
4455 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004456 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004457
Benjamin Peterson29060642009-01-31 22:14:21 +00004458 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 Py_XDECREF(errorHandler);
4460 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004461 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004462 return NULL;
4463}
4464
4465
Alexander Belopolsky40018472011-02-26 01:02:56 +00004466PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004467_PyUnicode_EncodeUTF7(PyObject *str,
4468 int base64SetO,
4469 int base64WhiteSpace,
4470 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004471{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004472 int kind;
4473 void *data;
4474 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004475 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004476 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004477 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004478 unsigned int base64bits = 0;
4479 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004480 char * out;
4481 char * start;
4482
Benjamin Petersonbac79492012-01-14 13:34:47 -05004483 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004484 return NULL;
4485 kind = PyUnicode_KIND(str);
4486 data = PyUnicode_DATA(str);
4487 len = PyUnicode_GET_LENGTH(str);
4488
4489 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004490 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004491
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004492 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004493 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004494 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004495 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004496 if (v == NULL)
4497 return NULL;
4498
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004499 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004500 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004501 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004502
Antoine Pitrou244651a2009-05-04 18:56:13 +00004503 if (inShift) {
4504 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4505 /* shifting out */
4506 if (base64bits) { /* output remaining bits */
4507 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4508 base64buffer = 0;
4509 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004510 }
4511 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004512 /* Characters not in the BASE64 set implicitly unshift the sequence
4513 so no '-' is required, except if the character is itself a '-' */
4514 if (IS_BASE64(ch) || ch == '-') {
4515 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004516 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004517 *out++ = (char) ch;
4518 }
4519 else {
4520 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004521 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004522 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004523 else { /* not in a shift sequence */
4524 if (ch == '+') {
4525 *out++ = '+';
4526 *out++ = '-';
4527 }
4528 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4529 *out++ = (char) ch;
4530 }
4531 else {
4532 *out++ = '+';
4533 inShift = 1;
4534 goto encode_char;
4535 }
4536 }
4537 continue;
4538encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004539 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004540 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004541
Antoine Pitrou244651a2009-05-04 18:56:13 +00004542 /* code first surrogate */
4543 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004544 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004545 while (base64bits >= 6) {
4546 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4547 base64bits -= 6;
4548 }
4549 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004550 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004551 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004552 base64bits += 16;
4553 base64buffer = (base64buffer << 16) | ch;
4554 while (base64bits >= 6) {
4555 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4556 base64bits -= 6;
4557 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004558 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004559 if (base64bits)
4560 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4561 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004562 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004563 if (_PyBytes_Resize(&v, out - start) < 0)
4564 return NULL;
4565 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004566}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004567PyObject *
4568PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4569 Py_ssize_t size,
4570 int base64SetO,
4571 int base64WhiteSpace,
4572 const char *errors)
4573{
4574 PyObject *result;
4575 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4576 if (tmp == NULL)
4577 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004578 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004579 base64WhiteSpace, errors);
4580 Py_DECREF(tmp);
4581 return result;
4582}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004583
Antoine Pitrou244651a2009-05-04 18:56:13 +00004584#undef IS_BASE64
4585#undef FROM_BASE64
4586#undef TO_BASE64
4587#undef DECODE_DIRECT
4588#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004589
Guido van Rossumd57fd912000-03-10 22:53:23 +00004590/* --- UTF-8 Codec -------------------------------------------------------- */
4591
Alexander Belopolsky40018472011-02-26 01:02:56 +00004592PyObject *
4593PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004594 Py_ssize_t size,
4595 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596{
Walter Dörwald69652032004-09-07 20:24:22 +00004597 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4598}
4599
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004600#include "stringlib/asciilib.h"
4601#include "stringlib/codecs.h"
4602#include "stringlib/undef.h"
4603
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004604#include "stringlib/ucs1lib.h"
4605#include "stringlib/codecs.h"
4606#include "stringlib/undef.h"
4607
4608#include "stringlib/ucs2lib.h"
4609#include "stringlib/codecs.h"
4610#include "stringlib/undef.h"
4611
4612#include "stringlib/ucs4lib.h"
4613#include "stringlib/codecs.h"
4614#include "stringlib/undef.h"
4615
Antoine Pitrouab868312009-01-10 15:40:25 +00004616/* Mask to quickly check whether a C 'long' contains a
4617 non-ASCII, UTF8-encoded char. */
4618#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004619# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004620#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004621# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004622#else
4623# error C 'long' size should be either 4 or 8!
4624#endif
4625
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004626static Py_ssize_t
4627ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004628{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004629 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004630 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004631
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004632 /*
4633 * Issue #17237: m68k is a bit different from most architectures in
4634 * that objects do not use "natural alignment" - for example, int and
4635 * long are only aligned at 2-byte boundaries. Therefore the assert()
4636 * won't work; also, tests have shown that skipping the "optimised
4637 * version" will even speed up m68k.
4638 */
4639#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004640#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004641 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4642 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004643 /* Fast path, see in STRINGLIB(utf8_decode) for
4644 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004645 /* Help allocation */
4646 const char *_p = p;
4647 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004648 while (_p < aligned_end) {
4649 unsigned long value = *(const unsigned long *) _p;
4650 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004651 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004652 *((unsigned long *)q) = value;
4653 _p += SIZEOF_LONG;
4654 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004655 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004656 p = _p;
4657 while (p < end) {
4658 if ((unsigned char)*p & 0x80)
4659 break;
4660 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004661 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004662 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004664#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004665#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004666 while (p < end) {
4667 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4668 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004669 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004670 /* Help allocation */
4671 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004672 while (_p < aligned_end) {
4673 unsigned long value = *(unsigned long *) _p;
4674 if (value & ASCII_CHAR_MASK)
4675 break;
4676 _p += SIZEOF_LONG;
4677 }
4678 p = _p;
4679 if (_p == end)
4680 break;
4681 }
4682 if ((unsigned char)*p & 0x80)
4683 break;
4684 ++p;
4685 }
4686 memcpy(dest, start, p - start);
4687 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004688}
Antoine Pitrouab868312009-01-10 15:40:25 +00004689
Victor Stinner785938e2011-12-11 20:09:03 +01004690PyObject *
4691PyUnicode_DecodeUTF8Stateful(const char *s,
4692 Py_ssize_t size,
4693 const char *errors,
4694 Py_ssize_t *consumed)
4695{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004696 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004697 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004698 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004699
4700 Py_ssize_t startinpos;
4701 Py_ssize_t endinpos;
4702 const char *errmsg = "";
4703 PyObject *errorHandler = NULL;
4704 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004705
4706 if (size == 0) {
4707 if (consumed)
4708 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004709 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004710 }
4711
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004712 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4713 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004714 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004715 *consumed = 1;
4716 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004717 }
4718
Victor Stinner8f674cc2013-04-17 23:02:17 +02004719 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004720 writer.min_length = size;
4721 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004722 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004723
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004724 writer.pos = ascii_decode(s, end, writer.data);
4725 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004726 while (s < end) {
4727 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004728 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004729 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004730 if (PyUnicode_IS_ASCII(writer.buffer))
4731 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004732 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004733 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004734 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004735 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004736 } else {
4737 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004738 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004739 }
4740
4741 switch (ch) {
4742 case 0:
4743 if (s == end || consumed)
4744 goto End;
4745 errmsg = "unexpected end of data";
4746 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004747 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004748 break;
4749 case 1:
4750 errmsg = "invalid start byte";
4751 startinpos = s - starts;
4752 endinpos = startinpos + 1;
4753 break;
4754 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004755 case 3:
4756 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004757 errmsg = "invalid continuation byte";
4758 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004759 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004760 break;
4761 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004762 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004763 goto onError;
4764 continue;
4765 }
4766
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004767 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004768 errors, &errorHandler,
4769 "utf-8", errmsg,
4770 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004771 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004772 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004773 }
4774
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004775End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004776 if (consumed)
4777 *consumed = s - starts;
4778
4779 Py_XDECREF(errorHandler);
4780 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004781 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004782
4783onError:
4784 Py_XDECREF(errorHandler);
4785 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004786 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004787 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004788}
4789
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   s, size : the UTF-8 input bytes.

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_RawFree() to free the memory), or NULL if the
   buffer size would overflow or the allocation fails.

   Bytes that do not decode are stored as 0xDC00 + byte value. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count.  The limit is compared against size itself so
       that "size + 1" is never computed when it could overflow. */
    if (size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* The value is split into a surrogate pair below, so it has
               to be a code point beyond the BMP (it cannot itself be a
               surrogate). */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* 0 with the input exhausted means we are done */
            if (!ch && s == e)
                break;
            /* surrogateescape: keep the offending byte as a lone low
               surrogate and move past it */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004845
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004846/* Primary internal function which creates utf8 encoded bytes objects.
4847
4848 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004849 and allocate exactly as much space needed at the end. Else allocate the
4850 maximum possible needed (4 result bytes per Unicode character), and return
4851 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004852*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004853PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004854_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855{
Victor Stinner6099a032011-12-18 14:22:26 +01004856 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004857 void *data;
4858 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004860 if (!PyUnicode_Check(unicode)) {
4861 PyErr_BadArgument();
4862 return NULL;
4863 }
4864
4865 if (PyUnicode_READY(unicode) == -1)
4866 return NULL;
4867
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004868 if (PyUnicode_UTF8(unicode))
4869 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4870 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004871
4872 kind = PyUnicode_KIND(unicode);
4873 data = PyUnicode_DATA(unicode);
4874 size = PyUnicode_GET_LENGTH(unicode);
4875
Benjamin Petersonead6b532011-12-20 17:23:42 -06004876 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004877 default:
4878 assert(0);
4879 case PyUnicode_1BYTE_KIND:
4880 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4881 assert(!PyUnicode_IS_ASCII(unicode));
4882 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4883 case PyUnicode_2BYTE_KIND:
4884 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4885 case PyUnicode_4BYTE_KIND:
4886 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004887 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888}
4889
Alexander Belopolsky40018472011-02-26 01:02:56 +00004890PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004891PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4892 Py_ssize_t size,
4893 const char *errors)
4894{
4895 PyObject *v, *unicode;
4896
4897 unicode = PyUnicode_FromUnicode(s, size);
4898 if (unicode == NULL)
4899 return NULL;
4900 v = _PyUnicode_AsUTF8String(unicode, errors);
4901 Py_DECREF(unicode);
4902 return v;
4903}
4904
4905PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004906PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004908 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004909}
4910
Walter Dörwald41980ca2007-08-16 21:55:45 +00004911/* --- UTF-32 Codec ------------------------------------------------------- */
4912
4913PyObject *
4914PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004915 Py_ssize_t size,
4916 const char *errors,
4917 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004918{
4919 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4920}
4921
4922PyObject *
4923PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004924 Py_ssize_t size,
4925 const char *errors,
4926 int *byteorder,
4927 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004928{
4929 const char *starts = s;
4930 Py_ssize_t startinpos;
4931 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004932 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004933 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004934 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004935 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004936 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004937 PyObject *errorHandler = NULL;
4938 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004939
Walter Dörwald41980ca2007-08-16 21:55:45 +00004940 q = (unsigned char *)s;
4941 e = q + size;
4942
4943 if (byteorder)
4944 bo = *byteorder;
4945
4946 /* Check for BOM marks (U+FEFF) in the input and adjust current
4947 byte order setting accordingly. In native mode, the leading BOM
4948 mark is skipped, in all other modes, it is copied to the output
4949 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004950 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07004951 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01004952 if (bom == 0x0000FEFF) {
4953 bo = -1;
4954 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004955 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004956 else if (bom == 0xFFFE0000) {
4957 bo = 1;
4958 q += 4;
4959 }
4960 if (byteorder)
4961 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004962 }
4963
Victor Stinnere64322e2012-10-30 23:12:47 +01004964 if (q == e) {
4965 if (consumed)
4966 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004967 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004968 }
4969
Victor Stinnere64322e2012-10-30 23:12:47 +01004970#ifdef WORDS_BIGENDIAN
4971 le = bo < 0;
4972#else
4973 le = bo <= 0;
4974#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004975 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01004976
Victor Stinner8f674cc2013-04-17 23:02:17 +02004977 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004978 writer.min_length = (e - q + 3) / 4;
4979 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004980 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004981
Victor Stinnere64322e2012-10-30 23:12:47 +01004982 while (1) {
4983 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004984 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004985
Victor Stinnere64322e2012-10-30 23:12:47 +01004986 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004987 enum PyUnicode_Kind kind = writer.kind;
4988 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004989 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004990 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004991 if (le) {
4992 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07004993 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01004994 if (ch > maxch)
4995 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004996 if (kind != PyUnicode_1BYTE_KIND &&
4997 Py_UNICODE_IS_SURROGATE(ch))
4998 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004999 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005000 q += 4;
5001 } while (q <= last);
5002 }
5003 else {
5004 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005005 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005006 if (ch > maxch)
5007 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005008 if (kind != PyUnicode_1BYTE_KIND &&
5009 Py_UNICODE_IS_SURROGATE(ch))
5010 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005011 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005012 q += 4;
5013 } while (q <= last);
5014 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005015 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005016 }
5017
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005018 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005019 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005020 startinpos = ((const char *)q) - starts;
5021 endinpos = startinpos + 4;
5022 }
5023 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005024 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005026 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005027 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005028 startinpos = ((const char *)q) - starts;
5029 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005030 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005031 else {
5032 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005033 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005034 goto onError;
5035 q += 4;
5036 continue;
5037 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005038 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005039 startinpos = ((const char *)q) - starts;
5040 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005041 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005042
5043 /* The remaining input chars are ignored if the callback
5044 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005045 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005046 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005047 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005048 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005049 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005050 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005051 }
5052
Walter Dörwald41980ca2007-08-16 21:55:45 +00005053 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005055
Walter Dörwald41980ca2007-08-16 21:55:45 +00005056 Py_XDECREF(errorHandler);
5057 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005058 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005059
Benjamin Peterson29060642009-01-31 22:14:21 +00005060 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005061 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005062 Py_XDECREF(errorHandler);
5063 Py_XDECREF(exc);
5064 return NULL;
5065}
5066
5067PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005068_PyUnicode_EncodeUTF32(PyObject *str,
5069 const char *errors,
5070 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005071{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005072 enum PyUnicode_Kind kind;
5073 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005074 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005075 PyObject *v;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005076 PY_UINT32_T *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005077#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005078 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005079#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005080 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005081#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005082 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005083 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005084 PyObject *errorHandler = NULL;
5085 PyObject *exc = NULL;
5086 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005087
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005088 if (!PyUnicode_Check(str)) {
5089 PyErr_BadArgument();
5090 return NULL;
5091 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005092 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005093 return NULL;
5094 kind = PyUnicode_KIND(str);
5095 data = PyUnicode_DATA(str);
5096 len = PyUnicode_GET_LENGTH(str);
5097
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005098 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005099 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005100 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005101 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005102 if (v == NULL)
5103 return NULL;
5104
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005105 /* output buffer is 4-bytes aligned */
5106 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5107 out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005108 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005109 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005110 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005111 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005112
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005113 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005114 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005115 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005116 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005117 else
5118 encoding = "utf-32";
5119
5120 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005121 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5122 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005123 }
5124
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005125 pos = 0;
5126 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005127 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005128
5129 if (kind == PyUnicode_2BYTE_KIND) {
5130 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5131 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005132 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005133 else {
5134 assert(kind == PyUnicode_4BYTE_KIND);
5135 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5136 &out, native_ordering);
5137 }
5138 if (pos == len)
5139 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005140
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005141 rep = unicode_encode_call_errorhandler(
5142 errors, &errorHandler,
5143 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005144 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005145 if (!rep)
5146 goto error;
5147
5148 if (PyBytes_Check(rep)) {
5149 repsize = PyBytes_GET_SIZE(rep);
5150 if (repsize & 3) {
5151 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005152 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005153 "surrogates not allowed");
5154 goto error;
5155 }
5156 moreunits = repsize / 4;
5157 }
5158 else {
5159 assert(PyUnicode_Check(rep));
5160 if (PyUnicode_READY(rep) < 0)
5161 goto error;
5162 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5163 if (!PyUnicode_IS_ASCII(rep)) {
5164 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005165 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005166 "surrogates not allowed");
5167 goto error;
5168 }
5169 }
5170
5171 /* four bytes are reserved for each surrogate */
5172 if (moreunits > 1) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005173 Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005174 Py_ssize_t morebytes = 4 * (moreunits - 1);
5175 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5176 /* integer overflow */
5177 PyErr_NoMemory();
5178 goto error;
5179 }
5180 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5181 goto error;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005182 out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005183 }
5184
5185 if (PyBytes_Check(rep)) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005186 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5187 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005188 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005189 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005190 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5191 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005192 }
5193
5194 Py_CLEAR(rep);
5195 }
5196
5197 /* Cut back to size actually needed. This is necessary for, for example,
5198 encoding of a string containing isolated surrogates and the 'ignore'
5199 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005200 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005201 if (nsize != PyBytes_GET_SIZE(v))
5202 _PyBytes_Resize(&v, nsize);
5203 Py_XDECREF(errorHandler);
5204 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005205 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005206 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005207 error:
5208 Py_XDECREF(rep);
5209 Py_XDECREF(errorHandler);
5210 Py_XDECREF(exc);
5211 Py_XDECREF(v);
5212 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005213}
5214
Alexander Belopolsky40018472011-02-26 01:02:56 +00005215PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005216PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5217 Py_ssize_t size,
5218 const char *errors,
5219 int byteorder)
5220{
5221 PyObject *result;
5222 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5223 if (tmp == NULL)
5224 return NULL;
5225 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5226 Py_DECREF(tmp);
5227 return result;
5228}
5229
5230PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005231PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005232{
Victor Stinnerb960b342011-11-20 19:12:52 +01005233 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005234}
5235
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236/* --- UTF-16 Codec ------------------------------------------------------- */
5237
Tim Peters772747b2001-08-09 22:21:55 +00005238PyObject *
5239PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005240 Py_ssize_t size,
5241 const char *errors,
5242 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243{
Walter Dörwald69652032004-09-07 20:24:22 +00005244 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5245}
5246
5247PyObject *
5248PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005249 Py_ssize_t size,
5250 const char *errors,
5251 int *byteorder,
5252 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005253{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005254 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005255 Py_ssize_t startinpos;
5256 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005257 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005258 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005259 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005260 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005261 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005262 PyObject *errorHandler = NULL;
5263 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005264 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265
Tim Peters772747b2001-08-09 22:21:55 +00005266 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005267 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268
5269 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005270 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005272 /* Check for BOM marks (U+FEFF) in the input and adjust current
5273 byte order setting accordingly. In native mode, the leading BOM
5274 mark is skipped, in all other modes, it is copied to the output
5275 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005276 if (bo == 0 && size >= 2) {
5277 const Py_UCS4 bom = (q[1] << 8) | q[0];
5278 if (bom == 0xFEFF) {
5279 q += 2;
5280 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005281 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005282 else if (bom == 0xFFFE) {
5283 q += 2;
5284 bo = 1;
5285 }
5286 if (byteorder)
5287 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005288 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289
Antoine Pitrou63065d72012-05-15 23:48:04 +02005290 if (q == e) {
5291 if (consumed)
5292 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005293 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005294 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005295
Christian Heimes743e0cd2012-10-17 23:52:17 +02005296#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005297 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005298 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005299#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005300 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005301 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005302#endif
Tim Peters772747b2001-08-09 22:21:55 +00005303
Antoine Pitrou63065d72012-05-15 23:48:04 +02005304 /* Note: size will always be longer than the resulting Unicode
5305 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005306 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005307 writer.min_length = (e - q + 1) / 2;
5308 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005309 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005310
Antoine Pitrou63065d72012-05-15 23:48:04 +02005311 while (1) {
5312 Py_UCS4 ch = 0;
5313 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005314 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005315 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005316 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005317 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005318 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005319 native_ordering);
5320 else
5321 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005322 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005323 native_ordering);
5324 } else if (kind == PyUnicode_2BYTE_KIND) {
5325 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005326 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005327 native_ordering);
5328 } else {
5329 assert(kind == PyUnicode_4BYTE_KIND);
5330 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005331 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005332 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005333 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005334 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005335
Antoine Pitrou63065d72012-05-15 23:48:04 +02005336 switch (ch)
5337 {
5338 case 0:
5339 /* remaining byte at the end? (size should be even) */
5340 if (q == e || consumed)
5341 goto End;
5342 errmsg = "truncated data";
5343 startinpos = ((const char *)q) - starts;
5344 endinpos = ((const char *)e) - starts;
5345 break;
5346 /* The remaining input chars are ignored if the callback
5347 chooses to skip the input */
5348 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005349 q -= 2;
5350 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005351 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005352 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005353 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005354 endinpos = ((const char *)e) - starts;
5355 break;
5356 case 2:
5357 errmsg = "illegal encoding";
5358 startinpos = ((const char *)q) - 2 - starts;
5359 endinpos = startinpos + 2;
5360 break;
5361 case 3:
5362 errmsg = "illegal UTF-16 surrogate";
5363 startinpos = ((const char *)q) - 4 - starts;
5364 endinpos = startinpos + 2;
5365 break;
5366 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005367 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005368 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005369 continue;
5370 }
5371
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005372 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005373 errors,
5374 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005375 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005376 &starts,
5377 (const char **)&e,
5378 &startinpos,
5379 &endinpos,
5380 &exc,
5381 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005382 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005383 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384 }
5385
Antoine Pitrou63065d72012-05-15 23:48:04 +02005386End:
Walter Dörwald69652032004-09-07 20:24:22 +00005387 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005388 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005389
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005390 Py_XDECREF(errorHandler);
5391 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005392 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393
Benjamin Peterson29060642009-01-31 22:14:21 +00005394 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005395 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005396 Py_XDECREF(errorHandler);
5397 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005398 return NULL;
5399}
5400
Tim Peters772747b2001-08-09 22:21:55 +00005401PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005402_PyUnicode_EncodeUTF16(PyObject *str,
5403 const char *errors,
5404 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005406 enum PyUnicode_Kind kind;
5407 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005408 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005409 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005410 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005411 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005412#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005413 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005414#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005415 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005416#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005417 const char *encoding;
5418 Py_ssize_t nsize, pos;
5419 PyObject *errorHandler = NULL;
5420 PyObject *exc = NULL;
5421 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005422
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005423 if (!PyUnicode_Check(str)) {
5424 PyErr_BadArgument();
5425 return NULL;
5426 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005427 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005428 return NULL;
5429 kind = PyUnicode_KIND(str);
5430 data = PyUnicode_DATA(str);
5431 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005432
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005433 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005434 if (kind == PyUnicode_4BYTE_KIND) {
5435 const Py_UCS4 *in = (const Py_UCS4 *)data;
5436 const Py_UCS4 *end = in + len;
5437 while (in < end)
5438 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005439 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005440 }
5441 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005442 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005443 nsize = len + pairs + (byteorder == 0);
5444 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445 if (v == NULL)
5446 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005448 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005449 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005450 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005452 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005453 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005454 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005455
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005456 if (kind == PyUnicode_1BYTE_KIND) {
5457 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5458 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005459 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005460
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005461 if (byteorder < 0)
5462 encoding = "utf-16-le";
5463 else if (byteorder > 0)
5464 encoding = "utf-16-be";
5465 else
5466 encoding = "utf-16";
5467
5468 pos = 0;
5469 while (pos < len) {
5470 Py_ssize_t repsize, moreunits;
5471
5472 if (kind == PyUnicode_2BYTE_KIND) {
5473 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5474 &out, native_ordering);
5475 }
5476 else {
5477 assert(kind == PyUnicode_4BYTE_KIND);
5478 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5479 &out, native_ordering);
5480 }
5481 if (pos == len)
5482 break;
5483
5484 rep = unicode_encode_call_errorhandler(
5485 errors, &errorHandler,
5486 encoding, "surrogates not allowed",
5487 str, &exc, pos, pos + 1, &pos);
5488 if (!rep)
5489 goto error;
5490
5491 if (PyBytes_Check(rep)) {
5492 repsize = PyBytes_GET_SIZE(rep);
5493 if (repsize & 1) {
5494 raise_encode_exception(&exc, encoding,
5495 str, pos - 1, pos,
5496 "surrogates not allowed");
5497 goto error;
5498 }
5499 moreunits = repsize / 2;
5500 }
5501 else {
5502 assert(PyUnicode_Check(rep));
5503 if (PyUnicode_READY(rep) < 0)
5504 goto error;
5505 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5506 if (!PyUnicode_IS_ASCII(rep)) {
5507 raise_encode_exception(&exc, encoding,
5508 str, pos - 1, pos,
5509 "surrogates not allowed");
5510 goto error;
5511 }
5512 }
5513
5514 /* two bytes are reserved for each surrogate */
5515 if (moreunits > 1) {
5516 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5517 Py_ssize_t morebytes = 2 * (moreunits - 1);
5518 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5519 /* integer overflow */
5520 PyErr_NoMemory();
5521 goto error;
5522 }
5523 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5524 goto error;
5525 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5526 }
5527
5528 if (PyBytes_Check(rep)) {
5529 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5530 out += moreunits;
5531 } else /* rep is unicode */ {
5532 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5533 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5534 &out, native_ordering);
5535 }
5536
5537 Py_CLEAR(rep);
5538 }
5539
5540 /* Cut back to size actually needed. This is necessary for, for example,
5541 encoding of a string containing isolated surrogates and the 'ignore' handler
5542 is used. */
5543 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5544 if (nsize != PyBytes_GET_SIZE(v))
5545 _PyBytes_Resize(&v, nsize);
5546 Py_XDECREF(errorHandler);
5547 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005548 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005549 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005550 error:
5551 Py_XDECREF(rep);
5552 Py_XDECREF(errorHandler);
5553 Py_XDECREF(exc);
5554 Py_XDECREF(v);
5555 return NULL;
5556#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557}
5558
Alexander Belopolsky40018472011-02-26 01:02:56 +00005559PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005560PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5561 Py_ssize_t size,
5562 const char *errors,
5563 int byteorder)
5564{
5565 PyObject *result;
5566 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5567 if (tmp == NULL)
5568 return NULL;
5569 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5570 Py_DECREF(tmp);
5571 return result;
5572}
5573
5574PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005575PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005577 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578}
5579
5580/* --- Unicode Escape Codec ----------------------------------------------- */
5581
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005582/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5583 if all the escapes in the string make it still a valid ASCII string.
5584 Returns -1 if any escapes were found which cause the string to
5585 pop out of ASCII range. Otherwise returns the length of the
5586 required buffer to hold the string.
5587 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005588static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005589length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5590{
5591 const unsigned char *p = (const unsigned char *)s;
5592 const unsigned char *end = p + size;
5593 Py_ssize_t length = 0;
5594
5595 if (size < 0)
5596 return -1;
5597
5598 for (; p < end; ++p) {
5599 if (*p > 127) {
5600 /* Non-ASCII */
5601 return -1;
5602 }
5603 else if (*p != '\\') {
5604 /* Normal character */
5605 ++length;
5606 }
5607 else {
5608 /* Backslash-escape, check next char */
5609 ++p;
5610 /* Escape sequence reaches till end of string or
5611 non-ASCII follow-up. */
5612 if (p >= end || *p > 127)
5613 return -1;
5614 switch (*p) {
5615 case '\n':
5616 /* backslash + \n result in zero characters */
5617 break;
5618 case '\\': case '\'': case '\"':
5619 case 'b': case 'f': case 't':
5620 case 'n': case 'r': case 'v': case 'a':
5621 ++length;
5622 break;
5623 case '0': case '1': case '2': case '3':
5624 case '4': case '5': case '6': case '7':
5625 case 'x': case 'u': case 'U': case 'N':
5626 /* these do not guarantee ASCII characters */
5627 return -1;
5628 default:
5629 /* count the backslash + the other character */
5630 length += 2;
5631 }
5632 }
5633 }
5634 return length;
5635}
5636
/* Cached pointer to the unicodedata name-lookup C API.  It starts out NULL;
   PyUnicode_DecodeUnicodeEscape() fills it in through PyCapsule_Import()
   when it meets a \N escape while the pointer is still NULL. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005638
Alexander Belopolsky40018472011-02-26 01:02:56 +00005639PyObject *
5640PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005641 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005642 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005644 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005645 Py_ssize_t startinpos;
5646 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005647 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005649 char* message;
5650 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005651 PyObject *errorHandler = NULL;
5652 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005653 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005654
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005655 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005656 if (len == 0)
5657 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005658
5659 /* After length_of_escaped_ascii_string() there are two alternatives,
5660 either the string is pure ASCII with named escapes like \n, etc.
5661 and we determined it's exact size (common case)
5662 or it contains \x, \u, ... escape sequences. then we create a
5663 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005664 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005665 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005666 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005667 }
5668 else {
5669 /* Escaped strings will always be longer than the resulting
5670 Unicode string, so we start with size here and then reduce the
5671 length after conversion to the true value.
5672 (but if the error callback returns a long replacement string
5673 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005674 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005675 }
5676
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005678 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005680
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681 while (s < end) {
5682 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005683 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005684 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685
5686 /* Non-escape characters are interpreted as Unicode ordinals */
5687 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005688 x = (unsigned char)*s;
5689 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005690 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005691 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692 continue;
5693 }
5694
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005695 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696 /* \ - Escapes */
5697 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005698 c = *s++;
5699 if (s > end)
5700 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005701
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005702 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703
Benjamin Peterson29060642009-01-31 22:14:21 +00005704 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005705#define WRITECHAR(ch) \
5706 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005707 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005708 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005709 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005710
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005712 case '\\': WRITECHAR('\\'); break;
5713 case '\'': WRITECHAR('\''); break;
5714 case '\"': WRITECHAR('\"'); break;
5715 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005716 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005717 case 'f': WRITECHAR('\014'); break;
5718 case 't': WRITECHAR('\t'); break;
5719 case 'n': WRITECHAR('\n'); break;
5720 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005721 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005722 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005723 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005724 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727 case '0': case '1': case '2': case '3':
5728 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005729 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005730 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005731 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005732 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005733 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005735 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 break;
5737
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 /* hex escapes */
5739 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005741 digits = 2;
5742 message = "truncated \\xXX escape";
5743 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744
Benjamin Peterson29060642009-01-31 22:14:21 +00005745 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005747 digits = 4;
5748 message = "truncated \\uXXXX escape";
5749 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005752 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005753 digits = 8;
5754 message = "truncated \\UXXXXXXXX escape";
5755 hexescape:
5756 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005757 if (end - s < digits) {
5758 /* count only hex digits */
5759 for (; s < end; ++s) {
5760 c = (unsigned char)*s;
5761 if (!Py_ISXDIGIT(c))
5762 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005763 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005764 goto error;
5765 }
5766 for (; digits--; ++s) {
5767 c = (unsigned char)*s;
5768 if (!Py_ISXDIGIT(c))
5769 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005770 chr = (chr<<4) & ~0xF;
5771 if (c >= '0' && c <= '9')
5772 chr += c - '0';
5773 else if (c >= 'a' && c <= 'f')
5774 chr += 10 + c - 'a';
5775 else
5776 chr += 10 + c - 'A';
5777 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005778 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005779 /* _decoding_error will have already written into the
5780 target buffer. */
5781 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005782 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005783 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005784 message = "illegal Unicode character";
5785 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005786 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005787 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005788 break;
5789
Benjamin Peterson29060642009-01-31 22:14:21 +00005790 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005791 case 'N':
5792 message = "malformed \\N character escape";
5793 if (ucnhash_CAPI == NULL) {
5794 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005795 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5796 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005797 if (ucnhash_CAPI == NULL)
5798 goto ucnhashError;
5799 }
5800 if (*s == '{') {
5801 const char *start = s+1;
5802 /* look for the closing brace */
5803 while (*s != '}' && s < end)
5804 s++;
5805 if (s > start && s < end && *s == '}') {
5806 /* found a name. look it up in the unicode database */
5807 message = "unknown Unicode character name";
5808 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005809 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005810 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005811 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005812 goto store;
5813 }
5814 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005815 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005816
5817 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005818 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005819 message = "\\ at end of string";
5820 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005821 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005822 }
5823 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005824 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005825 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005826 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005827 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005829 continue;
5830
5831 error:
5832 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005833 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005834 errors, &errorHandler,
5835 "unicodeescape", message,
5836 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005837 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005838 goto onError;
5839 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005841#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005842
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005843 Py_XDECREF(errorHandler);
5844 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005845 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005846
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005848 PyErr_SetString(
5849 PyExc_UnicodeError,
5850 "\\N escapes not supported (can't load unicodedata module)"
5851 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005852 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005853 Py_XDECREF(errorHandler);
5854 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005855 return NULL;
5856
Benjamin Peterson29060642009-01-31 22:14:21 +00005857 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005858 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005859 Py_XDECREF(errorHandler);
5860 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 return NULL;
5862}
5863
5864/* Return a Unicode-Escape string version of the Unicode object.
5865
5866 If quotes is true, the string is enclosed in u"" or u'' quotes as
5867 appropriate.
5868
5869*/
5870
Alexander Belopolsky40018472011-02-26 01:02:56 +00005871PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005872PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005874 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005875 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005877 int kind;
5878 void *data;
5879 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880
Ezio Melottie7f90372012-10-05 03:33:31 +03005881 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005882 escape.
5883
Ezio Melottie7f90372012-10-05 03:33:31 +03005884 For UCS1 strings it's '\xxx', 4 bytes per source character.
5885 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5886 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005887 */
5888
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005889 if (!PyUnicode_Check(unicode)) {
5890 PyErr_BadArgument();
5891 return NULL;
5892 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005893 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005894 return NULL;
5895 len = PyUnicode_GET_LENGTH(unicode);
5896 kind = PyUnicode_KIND(unicode);
5897 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005898 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005899 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5900 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5901 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5902 }
5903
5904 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005905 return PyBytes_FromStringAndSize(NULL, 0);
5906
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005907 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005909
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005910 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005911 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005912 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 if (repr == NULL)
5915 return NULL;
5916
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005917 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005919 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005920 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005921
Walter Dörwald79e913e2007-05-12 11:08:06 +00005922 /* Escape backslashes */
5923 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 *p++ = '\\';
5925 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005926 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005927 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005928
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005929 /* Map 21-bit characters to '\U00xxxxxx' */
5930 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005931 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005932 *p++ = '\\';
5933 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005934 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5935 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5936 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5937 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5938 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5939 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5940 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5941 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005942 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005943 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005944
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005946 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947 *p++ = '\\';
5948 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005949 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5950 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5951 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5952 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005954
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005955 /* Map special whitespace to '\t', \n', '\r' */
5956 else if (ch == '\t') {
5957 *p++ = '\\';
5958 *p++ = 't';
5959 }
5960 else if (ch == '\n') {
5961 *p++ = '\\';
5962 *p++ = 'n';
5963 }
5964 else if (ch == '\r') {
5965 *p++ = '\\';
5966 *p++ = 'r';
5967 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005968
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005969 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005970 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005972 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005973 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5974 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005975 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005976
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 /* Copy everything else as-is */
5978 else
5979 *p++ = (char) ch;
5980 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005982 assert(p - PyBytes_AS_STRING(repr) > 0);
5983 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5984 return NULL;
5985 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986}
5987
Alexander Belopolsky40018472011-02-26 01:02:56 +00005988PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005989PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5990 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005992 PyObject *result;
5993 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5994 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005996 result = PyUnicode_AsUnicodeEscapeString(tmp);
5997 Py_DECREF(tmp);
5998 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999}
6000
6001/* --- Raw Unicode Escape Codec ------------------------------------------- */
6002
Alexander Belopolsky40018472011-02-26 01:02:56 +00006003PyObject *
6004PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006005 Py_ssize_t size,
6006 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006008 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006009 Py_ssize_t startinpos;
6010 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006011 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 const char *end;
6013 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006014 PyObject *errorHandler = NULL;
6015 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006016
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006017 if (size == 0)
6018 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006019
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 /* Escaped strings will always be longer than the resulting
6021 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006022 length after conversion to the true value. (But decoding error
6023 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006024 _PyUnicodeWriter_Init(&writer);
6025 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006026
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 end = s + size;
6028 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006029 unsigned char c;
6030 Py_UCS4 x;
6031 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006032 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033
Benjamin Peterson29060642009-01-31 22:14:21 +00006034 /* Non-escape characters are interpreted as Unicode ordinals */
6035 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006036 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006037 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006038 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006039 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006040 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 startinpos = s-starts;
6042
6043 /* \u-escapes are only interpreted iff the number of leading
6044 backslashes if odd */
6045 bs = s;
6046 for (;s < end;) {
6047 if (*s != '\\')
6048 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006049 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006050 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006051 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 }
6053 if (((s - bs) & 1) == 0 ||
6054 s >= end ||
6055 (*s != 'u' && *s != 'U')) {
6056 continue;
6057 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006058 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006059 count = *s=='u' ? 4 : 8;
6060 s++;
6061
6062 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006063 for (x = 0, i = 0; i < count; ++i, ++s) {
6064 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006065 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006067 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 errors, &errorHandler,
6069 "rawunicodeescape", "truncated \\uXXXX",
6070 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006071 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006072 goto onError;
6073 goto nextByte;
6074 }
6075 x = (x<<4) & ~0xF;
6076 if (c >= '0' && c <= '9')
6077 x += c - '0';
6078 else if (c >= 'a' && c <= 'f')
6079 x += 10 + c - 'a';
6080 else
6081 x += 10 + c - 'A';
6082 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006083 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006084 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006085 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006086 }
6087 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006088 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006089 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006090 errors, &errorHandler,
6091 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006092 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006093 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006095 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006096 nextByte:
6097 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006099 Py_XDECREF(errorHandler);
6100 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006101 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006102
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006104 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006105 Py_XDECREF(errorHandler);
6106 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 return NULL;
6108}
6109
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006110
Alexander Belopolsky40018472011-02-26 01:02:56 +00006111PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006112PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006114 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115 char *p;
6116 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006117 Py_ssize_t expandsize, pos;
6118 int kind;
6119 void *data;
6120 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006122 if (!PyUnicode_Check(unicode)) {
6123 PyErr_BadArgument();
6124 return NULL;
6125 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006126 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006127 return NULL;
6128 kind = PyUnicode_KIND(unicode);
6129 data = PyUnicode_DATA(unicode);
6130 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006131 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6132 bytes, and 1 byte characters 4. */
6133 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006134
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006135 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006137
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006138 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139 if (repr == NULL)
6140 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006141 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006142 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006144 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006145 for (pos = 0; pos < len; pos++) {
6146 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006147 /* Map 32-bit characters to '\Uxxxxxxxx' */
6148 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006149 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006150 *p++ = '\\';
6151 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006152 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6153 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6154 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6155 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6156 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6157 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6158 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6159 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006160 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006161 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006162 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 *p++ = '\\';
6164 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006165 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6166 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6167 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6168 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006170 /* Copy everything else as-is */
6171 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172 *p++ = (char) ch;
6173 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006174
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006175 assert(p > q);
6176 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006177 return NULL;
6178 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179}
6180
Alexander Belopolsky40018472011-02-26 01:02:56 +00006181PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006182PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6183 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006185 PyObject *result;
6186 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6187 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006188 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006189 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6190 Py_DECREF(tmp);
6191 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192}
6193
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006194/* --- Unicode Internal Codec ------------------------------------------- */
6195
Alexander Belopolsky40018472011-02-26 01:02:56 +00006196PyObject *
6197_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006198 Py_ssize_t size,
6199 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006200{
6201 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006202 Py_ssize_t startinpos;
6203 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006204 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006205 const char *end;
6206 const char *reason;
6207 PyObject *errorHandler = NULL;
6208 PyObject *exc = NULL;
6209
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006210 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006211 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006212 1))
6213 return NULL;
6214
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006215 if (size == 0)
6216 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006217
Victor Stinner8f674cc2013-04-17 23:02:17 +02006218 _PyUnicodeWriter_Init(&writer);
6219 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6220 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006222 }
6223 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006224
Victor Stinner8f674cc2013-04-17 23:02:17 +02006225 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006226 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006227 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006228 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006229 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006230 endinpos = end-starts;
6231 reason = "truncated input";
6232 goto error;
6233 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006234 /* We copy the raw representation one byte at a time because the
6235 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006236 ((char *) &uch)[0] = s[0];
6237 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006238#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006239 ((char *) &uch)[2] = s[2];
6240 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006241#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006242 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006243#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006244 /* We have to sanity check the raw data, otherwise doom looms for
6245 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006246 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006247 endinpos = s - starts + Py_UNICODE_SIZE;
6248 reason = "illegal code point (> 0x10FFFF)";
6249 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006250 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006251#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006252 s += Py_UNICODE_SIZE;
6253#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006254 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006255 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006256 Py_UNICODE uch2;
6257 ((char *) &uch2)[0] = s[0];
6258 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006259 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006260 {
Victor Stinner551ac952011-11-29 22:58:13 +01006261 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006262 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006263 }
6264 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006265#endif
6266
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006267 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006268 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006269 continue;
6270
6271 error:
6272 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006273 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006274 errors, &errorHandler,
6275 "unicode_internal", reason,
6276 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006277 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006278 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006279 }
6280
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006281 Py_XDECREF(errorHandler);
6282 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006283 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006284
Benjamin Peterson29060642009-01-31 22:14:21 +00006285 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006286 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006287 Py_XDECREF(errorHandler);
6288 Py_XDECREF(exc);
6289 return NULL;
6290}
6291
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292/* --- Latin-1 Codec ------------------------------------------------------ */
6293
Alexander Belopolsky40018472011-02-26 01:02:56 +00006294PyObject *
6295PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006296 Py_ssize_t size,
6297 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006300 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301}
6302
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006303/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006304static void
6305make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006306 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006307 PyObject *unicode,
6308 Py_ssize_t startpos, Py_ssize_t endpos,
6309 const char *reason)
6310{
6311 if (*exceptionObject == NULL) {
6312 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006313 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006314 encoding, unicode, startpos, endpos, reason);
6315 }
6316 else {
6317 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6318 goto onError;
6319 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6320 goto onError;
6321 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6322 goto onError;
6323 return;
6324 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006325 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006326 }
6327}
6328
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006329/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006330static void
6331raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006332 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006333 PyObject *unicode,
6334 Py_ssize_t startpos, Py_ssize_t endpos,
6335 const char *reason)
6336{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006337 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006338 encoding, unicode, startpos, endpos, reason);
6339 if (*exceptionObject != NULL)
6340 PyCodec_StrictErrors(*exceptionObject);
6341}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006342
6343/* error handling callback helper:
6344 build arguments, call the callback and check the arguments,
6345 put the result into newpos and return the replacement string, which
6346 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006347static PyObject *
6348unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006349 PyObject **errorHandler,
6350 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006351 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006352 Py_ssize_t startpos, Py_ssize_t endpos,
6353 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006354{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006355 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006356 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006357 PyObject *restuple;
6358 PyObject *resunicode;
6359
6360 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006361 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006362 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006364 }
6365
Benjamin Petersonbac79492012-01-14 13:34:47 -05006366 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006367 return NULL;
6368 len = PyUnicode_GET_LENGTH(unicode);
6369
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006370 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006371 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006372 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006373 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006374
6375 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006377 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006379 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006380 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006381 Py_DECREF(restuple);
6382 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006383 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006384 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006385 &resunicode, newpos)) {
6386 Py_DECREF(restuple);
6387 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006388 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006389 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6390 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6391 Py_DECREF(restuple);
6392 return NULL;
6393 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006394 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006395 *newpos = len + *newpos;
6396 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006397 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 Py_DECREF(restuple);
6399 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006400 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006401 Py_INCREF(resunicode);
6402 Py_DECREF(restuple);
6403 return resunicode;
6404}
6405
Alexander Belopolsky40018472011-02-26 01:02:56 +00006406static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006407unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006408 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006409 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006410{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006411 /* input state */
6412 Py_ssize_t pos=0, size;
6413 int kind;
6414 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006415 /* output object */
6416 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006417 /* pointer into the output */
6418 char *str;
6419 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006420 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006421 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6422 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006423 PyObject *errorHandler = NULL;
6424 PyObject *exc = NULL;
6425 /* the following variable is used for caching string comparisons
6426 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6427 int known_errorHandler = -1;
6428
Benjamin Petersonbac79492012-01-14 13:34:47 -05006429 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006430 return NULL;
6431 size = PyUnicode_GET_LENGTH(unicode);
6432 kind = PyUnicode_KIND(unicode);
6433 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006434 /* allocate enough for a simple encoding without
6435 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006436 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006437 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006438 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006439 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006440 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006441 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006442 ressize = size;
6443
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006444 while (pos < size) {
6445 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006446
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 /* can we encode this? */
6448 if (c<limit) {
6449 /* no overflow check, because we know that the space is enough */
6450 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006451 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006452 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006454 Py_ssize_t requiredsize;
6455 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006456 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006457 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006458 Py_ssize_t collstart = pos;
6459 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 /* find all unecodable characters */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006461 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006462 ++collend;
6463 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6464 if (known_errorHandler==-1) {
6465 if ((errors==NULL) || (!strcmp(errors, "strict")))
6466 known_errorHandler = 1;
6467 else if (!strcmp(errors, "replace"))
6468 known_errorHandler = 2;
6469 else if (!strcmp(errors, "ignore"))
6470 known_errorHandler = 3;
6471 else if (!strcmp(errors, "xmlcharrefreplace"))
6472 known_errorHandler = 4;
6473 else
6474 known_errorHandler = 0;
6475 }
6476 switch (known_errorHandler) {
6477 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006478 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006479 goto onError;
6480 case 2: /* replace */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006481 while (collstart++ < collend)
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 *str++ = '?'; /* fall through */
6483 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006484 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 break;
6486 case 4: /* xmlcharrefreplace */
6487 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006488 requiredsize = respos;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006489 /* determine replacement size */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006490 for (i = collstart; i < collend; ++i) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006491 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006492 Py_ssize_t incr;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006493 if (ch < 10)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006494 incr = 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006495 else if (ch < 100)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006496 incr = 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006497 else if (ch < 1000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006498 incr = 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006499 else if (ch < 10000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006500 incr = 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006501 else if (ch < 100000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006502 incr = 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006503 else if (ch < 1000000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006504 incr = 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006505 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006506 assert(ch <= MAX_UNICODE);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006507 incr = 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006508 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006509 if (requiredsize > PY_SSIZE_T_MAX - incr)
6510 goto overflow;
6511 requiredsize += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006513 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6514 goto overflow;
6515 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006516 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006517 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006518 requiredsize = 2*ressize;
6519 if (_PyBytes_Resize(&res, requiredsize))
6520 goto onError;
6521 str = PyBytes_AS_STRING(res) + respos;
6522 ressize = requiredsize;
6523 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006524 /* generate replacement */
6525 for (i = collstart; i < collend; ++i) {
6526 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006528 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 break;
6530 default:
6531 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006532 encoding, reason, unicode, &exc,
6533 collstart, collend, &newpos);
6534 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006535 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006537 if (PyBytes_Check(repunicode)) {
6538 /* Directly copy bytes result to output. */
6539 repsize = PyBytes_Size(repunicode);
6540 if (repsize > 1) {
6541 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006542 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006543 if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
6544 Py_DECREF(repunicode);
6545 goto overflow;
6546 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00006547 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6548 Py_DECREF(repunicode);
6549 goto onError;
6550 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006551 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006552 ressize += repsize-1;
6553 }
6554 memcpy(str, PyBytes_AsString(repunicode), repsize);
6555 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006556 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006557 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006558 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006559 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006560 /* need more space? (at least enough for what we
6561 have+the replacement+the rest of the string, so
6562 we won't have to check space for encodable characters) */
6563 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006564 repsize = PyUnicode_GET_LENGTH(repunicode);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006565 requiredsize = respos;
6566 if (requiredsize > PY_SSIZE_T_MAX - repsize)
6567 goto overflow;
6568 requiredsize += repsize;
6569 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6570 goto overflow;
6571 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006572 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006573 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 requiredsize = 2*ressize;
6575 if (_PyBytes_Resize(&res, requiredsize)) {
6576 Py_DECREF(repunicode);
6577 goto onError;
6578 }
6579 str = PyBytes_AS_STRING(res) + respos;
6580 ressize = requiredsize;
6581 }
6582 /* check if there is anything unencodable in the replacement
6583 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006584 for (i = 0; repsize-->0; ++i, ++str) {
6585 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006586 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006587 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006588 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 Py_DECREF(repunicode);
6590 goto onError;
6591 }
6592 *str = (char)c;
6593 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006594 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006595 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006596 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006597 }
6598 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006599 /* Resize if we allocated to much */
6600 size = str - PyBytes_AS_STRING(res);
6601 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006602 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006603 if (_PyBytes_Resize(&res, size) < 0)
6604 goto onError;
6605 }
6606
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006607 Py_XDECREF(errorHandler);
6608 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006609 return res;
6610
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006611 overflow:
6612 PyErr_SetString(PyExc_OverflowError,
6613 "encoded result is too long for a Python string");
6614
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006615 onError:
6616 Py_XDECREF(res);
6617 Py_XDECREF(errorHandler);
6618 Py_XDECREF(exc);
6619 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006620}
6621
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006622/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006623PyObject *
6624PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006625 Py_ssize_t size,
6626 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006628 PyObject *result;
6629 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6630 if (unicode == NULL)
6631 return NULL;
6632 result = unicode_encode_ucs1(unicode, errors, 256);
6633 Py_DECREF(unicode);
6634 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635}
6636
Alexander Belopolsky40018472011-02-26 01:02:56 +00006637PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006638_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639{
6640 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006641 PyErr_BadArgument();
6642 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006644 if (PyUnicode_READY(unicode) == -1)
6645 return NULL;
6646 /* Fast path: if it is a one-byte string, construct
6647 bytes object directly. */
6648 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6649 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6650 PyUnicode_GET_LENGTH(unicode));
6651 /* Non-Latin-1 characters present. Defer to above function to
6652 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006653 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006654}
6655
6656PyObject*
6657PyUnicode_AsLatin1String(PyObject *unicode)
6658{
6659 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660}
6661
6662/* --- 7-bit ASCII Codec -------------------------------------------------- */
6663
Alexander Belopolsky40018472011-02-26 01:02:56 +00006664PyObject *
6665PyUnicode_DecodeASCII(const char *s,
6666 Py_ssize_t size,
6667 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006669 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006670 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006671 int kind;
6672 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006673 Py_ssize_t startinpos;
6674 Py_ssize_t endinpos;
6675 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006676 const char *e;
6677 PyObject *errorHandler = NULL;
6678 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006679
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006681 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006682
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006684 if (size == 1 && (unsigned char)s[0] < 128)
6685 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006686
Victor Stinner8f674cc2013-04-17 23:02:17 +02006687 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006688 writer.min_length = size;
6689 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006690 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006691
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006692 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006693 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006694 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006695 writer.pos = outpos;
6696 if (writer.pos == size)
6697 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006698
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006699 s += writer.pos;
6700 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006701 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006702 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006703 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006704 PyUnicode_WRITE(kind, data, writer.pos, c);
6705 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006706 ++s;
6707 }
6708 else {
6709 startinpos = s-starts;
6710 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006711 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006712 errors, &errorHandler,
6713 "ascii", "ordinal not in range(128)",
6714 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006715 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006716 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006717 kind = writer.kind;
6718 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006721 Py_XDECREF(errorHandler);
6722 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006723 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006724
Benjamin Peterson29060642009-01-31 22:14:21 +00006725 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006726 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006727 Py_XDECREF(errorHandler);
6728 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729 return NULL;
6730}
6731
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006732/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006733PyObject *
6734PyUnicode_EncodeASCII(const Py_UNICODE *p,
6735 Py_ssize_t size,
6736 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006738 PyObject *result;
6739 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6740 if (unicode == NULL)
6741 return NULL;
6742 result = unicode_encode_ucs1(unicode, errors, 128);
6743 Py_DECREF(unicode);
6744 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745}
6746
Alexander Belopolsky40018472011-02-26 01:02:56 +00006747PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006748_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749{
6750 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 PyErr_BadArgument();
6752 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006754 if (PyUnicode_READY(unicode) == -1)
6755 return NULL;
6756 /* Fast path: if it is an ASCII-only string, construct bytes object
6757 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006758 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006759 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6760 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006761 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006762}
6763
6764PyObject *
6765PyUnicode_AsASCIIString(PyObject *unicode)
6766{
6767 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768}
6769
Victor Stinner99b95382011-07-04 14:23:54 +02006770#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006771
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006772/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006773
/* NEED_RETRY is defined when size_t is wider than int. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif

/* Supply WC_ERR_INVALID_CHARS when no earlier definition is in effect. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6781
6782static char*
6783code_page_name(UINT code_page, PyObject **obj)
6784{
6785 *obj = NULL;
6786 if (code_page == CP_ACP)
6787 return "mbcs";
6788 if (code_page == CP_UTF7)
6789 return "CP_UTF7";
6790 if (code_page == CP_UTF8)
6791 return "CP_UTF8";
6792
6793 *obj = PyBytes_FromFormat("cp%u", code_page);
6794 if (*obj == NULL)
6795 return NULL;
6796 return PyBytes_AS_STRING(*obj);
6797}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006798
Victor Stinner3a50e702011-10-18 21:21:00 +02006799static DWORD
6800decode_code_page_flags(UINT code_page)
6801{
6802 if (code_page == CP_UTF7) {
6803 /* The CP_UTF7 decoder only supports flags=0 */
6804 return 0;
6805 }
6806 else
6807 return MB_ERR_INVALID_CHARS;
6808}
6809
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006810/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006811 * Decode a byte string from a Windows code page into unicode object in strict
6812 * mode.
6813 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006814 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6815 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006816 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006817static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006818decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006819 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006820 const char *in,
6821 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006822{
Victor Stinner3a50e702011-10-18 21:21:00 +02006823 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006824 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006825 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006826
6827 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006828 assert(insize > 0);
6829 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6830 if (outsize <= 0)
6831 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006832
6833 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006834 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006835 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006836 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006837 if (*v == NULL)
6838 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006839 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006840 }
6841 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006842 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006843 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006844 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006845 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006846 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006847 }
6848
6849 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006850 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6851 if (outsize <= 0)
6852 goto error;
6853 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006854
Victor Stinner3a50e702011-10-18 21:21:00 +02006855error:
6856 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6857 return -2;
6858 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006859 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006860}
6861
Victor Stinner3a50e702011-10-18 21:21:00 +02006862/*
6863 * Decode a byte string from a code page into unicode object with an error
6864 * handler.
6865 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006866 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006867 * UnicodeDecodeError exception and returns -1 on error.
6868 */
6869static int
6870decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006871 PyObject **v,
6872 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01006873 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02006874{
6875 const char *startin = in;
6876 const char *endin = in + size;
6877 const DWORD flags = decode_code_page_flags(code_page);
6878 /* Ideally, we should get reason from FormatMessage. This is the Windows
6879 2000 English version of the message. */
6880 const char *reason = "No mapping for the Unicode character exists "
6881 "in the target code page.";
6882 /* each step cannot decode more than 1 character, but a character can be
6883 represented as a surrogate pair */
6884 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006885 int insize;
6886 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006887 PyObject *errorHandler = NULL;
6888 PyObject *exc = NULL;
6889 PyObject *encoding_obj = NULL;
6890 char *encoding;
6891 DWORD err;
6892 int ret = -1;
6893
6894 assert(size > 0);
6895
6896 encoding = code_page_name(code_page, &encoding_obj);
6897 if (encoding == NULL)
6898 return -1;
6899
Victor Stinner7d00cc12014-03-17 23:08:06 +01006900 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02006901 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6902 UnicodeDecodeError. */
6903 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6904 if (exc != NULL) {
6905 PyCodec_StrictErrors(exc);
6906 Py_CLEAR(exc);
6907 }
6908 goto error;
6909 }
6910
6911 if (*v == NULL) {
6912 /* Create unicode object */
6913 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6914 PyErr_NoMemory();
6915 goto error;
6916 }
Victor Stinnerab595942011-12-17 04:59:06 +01006917 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006918 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006919 if (*v == NULL)
6920 goto error;
6921 startout = PyUnicode_AS_UNICODE(*v);
6922 }
6923 else {
6924 /* Extend unicode object */
6925 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6926 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6927 PyErr_NoMemory();
6928 goto error;
6929 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006930 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006931 goto error;
6932 startout = PyUnicode_AS_UNICODE(*v) + n;
6933 }
6934
6935 /* Decode the byte string character per character */
6936 out = startout;
6937 while (in < endin)
6938 {
6939 /* Decode a character */
6940 insize = 1;
6941 do
6942 {
6943 outsize = MultiByteToWideChar(code_page, flags,
6944 in, insize,
6945 buffer, Py_ARRAY_LENGTH(buffer));
6946 if (outsize > 0)
6947 break;
6948 err = GetLastError();
6949 if (err != ERROR_NO_UNICODE_TRANSLATION
6950 && err != ERROR_INSUFFICIENT_BUFFER)
6951 {
6952 PyErr_SetFromWindowsErr(0);
6953 goto error;
6954 }
6955 insize++;
6956 }
6957 /* 4=maximum length of a UTF-8 sequence */
6958 while (insize <= 4 && (in + insize) <= endin);
6959
6960 if (outsize <= 0) {
6961 Py_ssize_t startinpos, endinpos, outpos;
6962
Victor Stinner7d00cc12014-03-17 23:08:06 +01006963 /* last character in partial decode? */
6964 if (in + insize >= endin && !final)
6965 break;
6966
Victor Stinner3a50e702011-10-18 21:21:00 +02006967 startinpos = in - startin;
6968 endinpos = startinpos + 1;
6969 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006970 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006971 errors, &errorHandler,
6972 encoding, reason,
6973 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006974 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006975 {
6976 goto error;
6977 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006978 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006979 }
6980 else {
6981 in += insize;
6982 memcpy(out, buffer, outsize * sizeof(wchar_t));
6983 out += outsize;
6984 }
6985 }
6986
6987 /* write a NUL character at the end */
6988 *out = 0;
6989
6990 /* Extend unicode object */
6991 outsize = out - startout;
6992 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006993 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006994 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02006995 /* (in - startin) <= size and size is an int */
6996 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02006997
6998error:
6999 Py_XDECREF(encoding_obj);
7000 Py_XDECREF(errorHandler);
7001 Py_XDECREF(exc);
7002 return ret;
7003}
7004
Victor Stinner3a50e702011-10-18 21:21:00 +02007005static PyObject *
7006decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007007 const char *s, Py_ssize_t size,
7008 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007009{
Victor Stinner76a31a62011-11-04 00:05:13 +01007010 PyObject *v = NULL;
7011 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007012
Victor Stinner3a50e702011-10-18 21:21:00 +02007013 if (code_page < 0) {
7014 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7015 return NULL;
7016 }
7017
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007018 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007020
Victor Stinner76a31a62011-11-04 00:05:13 +01007021 do
7022 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007023#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007024 if (size > INT_MAX) {
7025 chunk_size = INT_MAX;
7026 final = 0;
7027 done = 0;
7028 }
7029 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007030#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007031 {
7032 chunk_size = (int)size;
7033 final = (consumed == NULL);
7034 done = 1;
7035 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007036
Victor Stinner76a31a62011-11-04 00:05:13 +01007037 if (chunk_size == 0 && done) {
7038 if (v != NULL)
7039 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007040 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007041 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007042
Victor Stinner76a31a62011-11-04 00:05:13 +01007043 converted = decode_code_page_strict(code_page, &v,
7044 s, chunk_size);
7045 if (converted == -2)
7046 converted = decode_code_page_errors(code_page, &v,
7047 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007048 errors, final);
7049 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007050
7051 if (converted < 0) {
7052 Py_XDECREF(v);
7053 return NULL;
7054 }
7055
7056 if (consumed)
7057 *consumed += converted;
7058
7059 s += converted;
7060 size -= converted;
7061 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007062
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007063 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007064}
7065
Alexander Belopolsky40018472011-02-26 01:02:56 +00007066PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007067PyUnicode_DecodeCodePageStateful(int code_page,
7068 const char *s,
7069 Py_ssize_t size,
7070 const char *errors,
7071 Py_ssize_t *consumed)
7072{
7073 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7074}
7075
7076PyObject *
7077PyUnicode_DecodeMBCSStateful(const char *s,
7078 Py_ssize_t size,
7079 const char *errors,
7080 Py_ssize_t *consumed)
7081{
7082 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7083}
7084
7085PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007086PyUnicode_DecodeMBCS(const char *s,
7087 Py_ssize_t size,
7088 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007089{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007090 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7091}
7092
Victor Stinner3a50e702011-10-18 21:21:00 +02007093static DWORD
7094encode_code_page_flags(UINT code_page, const char *errors)
7095{
7096 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007097 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007098 }
7099 else if (code_page == CP_UTF7) {
7100 /* CP_UTF7 only supports flags=0 */
7101 return 0;
7102 }
7103 else {
7104 if (errors != NULL && strcmp(errors, "replace") == 0)
7105 return 0;
7106 else
7107 return WC_NO_BEST_FIT_CHARS;
7108 }
7109}
7110
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007111/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007112 * Encode a Unicode string to a Windows code page into a byte string in strict
7113 * mode.
7114 *
7115 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007116 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007117 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007118static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007119encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007120 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007122{
Victor Stinner554f3f02010-06-16 23:33:54 +00007123 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007124 BOOL *pusedDefaultChar = &usedDefaultChar;
7125 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007126 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007127 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007128 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007129 const DWORD flags = encode_code_page_flags(code_page, NULL);
7130 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007131 /* Create a substring so that we can get the UTF-16 representation
7132 of just the slice under consideration. */
7133 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007134
Martin v. Löwis3d325192011-11-04 18:23:06 +01007135 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007136
Victor Stinner3a50e702011-10-18 21:21:00 +02007137 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007138 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007139 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007140 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007141
Victor Stinner2fc507f2011-11-04 20:06:39 +01007142 substring = PyUnicode_Substring(unicode, offset, offset+len);
7143 if (substring == NULL)
7144 return -1;
7145 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7146 if (p == NULL) {
7147 Py_DECREF(substring);
7148 return -1;
7149 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007150 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007151
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007152 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007153 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007154 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007155 NULL, 0,
7156 NULL, pusedDefaultChar);
7157 if (outsize <= 0)
7158 goto error;
7159 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007160 if (pusedDefaultChar && *pusedDefaultChar) {
7161 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007162 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007163 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007164
Victor Stinner3a50e702011-10-18 21:21:00 +02007165 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007166 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007167 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007168 if (*outbytes == NULL) {
7169 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007170 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007171 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007172 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007173 }
7174 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007175 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007176 const Py_ssize_t n = PyBytes_Size(*outbytes);
7177 if (outsize > PY_SSIZE_T_MAX - n) {
7178 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007179 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007180 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007181 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007182 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7183 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007184 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007185 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007186 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007187 }
7188
7189 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007190 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007191 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 out, outsize,
7193 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007194 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007195 if (outsize <= 0)
7196 goto error;
7197 if (pusedDefaultChar && *pusedDefaultChar)
7198 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007199 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007200
Victor Stinner3a50e702011-10-18 21:21:00 +02007201error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007202 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007203 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7204 return -2;
7205 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007206 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007207}
7208
Victor Stinner3a50e702011-10-18 21:21:00 +02007209/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007210 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007211 * error handler.
7212 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007213 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007214 * -1 on other error.
7215 */
7216static int
7217encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007218 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007219 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007220{
Victor Stinner3a50e702011-10-18 21:21:00 +02007221 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007222 Py_ssize_t pos = unicode_offset;
7223 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007224 /* Ideally, we should get reason from FormatMessage. This is the Windows
7225 2000 English version of the message. */
7226 const char *reason = "invalid character";
7227 /* 4=maximum length of a UTF-8 sequence */
7228 char buffer[4];
7229 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7230 Py_ssize_t outsize;
7231 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007232 PyObject *errorHandler = NULL;
7233 PyObject *exc = NULL;
7234 PyObject *encoding_obj = NULL;
7235 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007236 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007237 PyObject *rep;
7238 int ret = -1;
7239
7240 assert(insize > 0);
7241
7242 encoding = code_page_name(code_page, &encoding_obj);
7243 if (encoding == NULL)
7244 return -1;
7245
7246 if (errors == NULL || strcmp(errors, "strict") == 0) {
7247 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7248 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007249 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007250 if (exc != NULL) {
7251 PyCodec_StrictErrors(exc);
7252 Py_DECREF(exc);
7253 }
7254 Py_XDECREF(encoding_obj);
7255 return -1;
7256 }
7257
7258 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7259 pusedDefaultChar = &usedDefaultChar;
7260 else
7261 pusedDefaultChar = NULL;
7262
7263 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7264 PyErr_NoMemory();
7265 goto error;
7266 }
7267 outsize = insize * Py_ARRAY_LENGTH(buffer);
7268
7269 if (*outbytes == NULL) {
7270 /* Create string object */
7271 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7272 if (*outbytes == NULL)
7273 goto error;
7274 out = PyBytes_AS_STRING(*outbytes);
7275 }
7276 else {
7277 /* Extend string object */
7278 Py_ssize_t n = PyBytes_Size(*outbytes);
7279 if (n > PY_SSIZE_T_MAX - outsize) {
7280 PyErr_NoMemory();
7281 goto error;
7282 }
7283 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7284 goto error;
7285 out = PyBytes_AS_STRING(*outbytes) + n;
7286 }
7287
7288 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007289 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007290 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007291 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7292 wchar_t chars[2];
7293 int charsize;
7294 if (ch < 0x10000) {
7295 chars[0] = (wchar_t)ch;
7296 charsize = 1;
7297 }
7298 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007299 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7300 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007301 charsize = 2;
7302 }
7303
Victor Stinner3a50e702011-10-18 21:21:00 +02007304 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007305 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007306 buffer, Py_ARRAY_LENGTH(buffer),
7307 NULL, pusedDefaultChar);
7308 if (outsize > 0) {
7309 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7310 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007311 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007312 memcpy(out, buffer, outsize);
7313 out += outsize;
7314 continue;
7315 }
7316 }
7317 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7318 PyErr_SetFromWindowsErr(0);
7319 goto error;
7320 }
7321
Victor Stinner3a50e702011-10-18 21:21:00 +02007322 rep = unicode_encode_call_errorhandler(
7323 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007324 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007325 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007326 if (rep == NULL)
7327 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007328 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007329
7330 if (PyBytes_Check(rep)) {
7331 outsize = PyBytes_GET_SIZE(rep);
7332 if (outsize != 1) {
7333 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7334 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7335 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7336 Py_DECREF(rep);
7337 goto error;
7338 }
7339 out = PyBytes_AS_STRING(*outbytes) + offset;
7340 }
7341 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7342 out += outsize;
7343 }
7344 else {
7345 Py_ssize_t i;
7346 enum PyUnicode_Kind kind;
7347 void *data;
7348
Benjamin Petersonbac79492012-01-14 13:34:47 -05007349 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007350 Py_DECREF(rep);
7351 goto error;
7352 }
7353
7354 outsize = PyUnicode_GET_LENGTH(rep);
7355 if (outsize != 1) {
7356 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7357 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7358 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7359 Py_DECREF(rep);
7360 goto error;
7361 }
7362 out = PyBytes_AS_STRING(*outbytes) + offset;
7363 }
7364 kind = PyUnicode_KIND(rep);
7365 data = PyUnicode_DATA(rep);
7366 for (i=0; i < outsize; i++) {
7367 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7368 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007369 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007370 encoding, unicode,
7371 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007372 "unable to encode error handler result to ASCII");
7373 Py_DECREF(rep);
7374 goto error;
7375 }
7376 *out = (unsigned char)ch;
7377 out++;
7378 }
7379 }
7380 Py_DECREF(rep);
7381 }
7382 /* write a NUL byte */
7383 *out = 0;
7384 outsize = out - PyBytes_AS_STRING(*outbytes);
7385 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7386 if (_PyBytes_Resize(outbytes, outsize) < 0)
7387 goto error;
7388 ret = 0;
7389
7390error:
7391 Py_XDECREF(encoding_obj);
7392 Py_XDECREF(errorHandler);
7393 Py_XDECREF(exc);
7394 return ret;
7395}
7396
Victor Stinner3a50e702011-10-18 21:21:00 +02007397static PyObject *
7398encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007399 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007400 const char *errors)
7401{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007402 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007403 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007404 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007405 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007406
Victor Stinner29dacf22015-01-26 16:41:32 +01007407 if (!PyUnicode_Check(unicode)) {
7408 PyErr_BadArgument();
7409 return NULL;
7410 }
7411
Benjamin Petersonbac79492012-01-14 13:34:47 -05007412 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007413 return NULL;
7414 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007415
Victor Stinner3a50e702011-10-18 21:21:00 +02007416 if (code_page < 0) {
7417 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7418 return NULL;
7419 }
7420
Martin v. Löwis3d325192011-11-04 18:23:06 +01007421 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007422 return PyBytes_FromStringAndSize(NULL, 0);
7423
Victor Stinner7581cef2011-11-03 22:32:33 +01007424 offset = 0;
7425 do
7426 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007427#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007428 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007429 chunks. */
7430 if (len > INT_MAX/2) {
7431 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007432 done = 0;
7433 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007434 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007435#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007436 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007437 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007438 done = 1;
7439 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007440
Victor Stinner76a31a62011-11-04 00:05:13 +01007441 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007442 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007443 errors);
7444 if (ret == -2)
7445 ret = encode_code_page_errors(code_page, &outbytes,
7446 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007447 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007448 if (ret < 0) {
7449 Py_XDECREF(outbytes);
7450 return NULL;
7451 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007452
Victor Stinner7581cef2011-11-03 22:32:33 +01007453 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007454 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007455 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007456
Victor Stinner3a50e702011-10-18 21:21:00 +02007457 return outbytes;
7458}
7459
7460PyObject *
7461PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7462 Py_ssize_t size,
7463 const char *errors)
7464{
Victor Stinner7581cef2011-11-03 22:32:33 +01007465 PyObject *unicode, *res;
7466 unicode = PyUnicode_FromUnicode(p, size);
7467 if (unicode == NULL)
7468 return NULL;
7469 res = encode_code_page(CP_ACP, unicode, errors);
7470 Py_DECREF(unicode);
7471 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007472}
7473
7474PyObject *
7475PyUnicode_EncodeCodePage(int code_page,
7476 PyObject *unicode,
7477 const char *errors)
7478{
Victor Stinner7581cef2011-11-03 22:32:33 +01007479 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007480}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007481
Alexander Belopolsky40018472011-02-26 01:02:56 +00007482PyObject *
7483PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007484{
Victor Stinner7581cef2011-11-03 22:32:33 +01007485 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007486}
7487
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007488#undef NEED_RETRY
7489
Victor Stinner99b95382011-07-04 14:23:54 +02007490#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007491
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492/* --- Character Mapping Codec -------------------------------------------- */
7493
/* Decode the bytes s[0..size) using the str object `mapping` as a lookup
   table indexed by byte value, and append the result to `writer`.

   A byte whose value is >= len(mapping), or whose table entry is U+FFFE,
   is treated as an undefined mapping and passed to the error handler
   selected by `errors`.

   Return 0 on success and -1 on failure. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        /* With a 1-byte table of at least 256 entries every input byte has
         * an entry and no entry can hold 0xfffe, so this loop never calls
         * the error handler.  Writes are not bounds-checked here; the
         * visible caller PyUnicode_DecodeCharmap prepares the writer for
         * `size` characters beforehand. */
        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* Raise the writer's maxchar to 0xff, then refresh the
                 * cached maxchar and data pointer from the writer. */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        /* errorHandler and exc are still NULL on this path. */
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight loops for a 2-byte table when the writer buffer is
             * currently 1-byte or 2-byte wide.  They leave through
             * `goto Error` (with x already looked up) as soon as a
             * character needs the generic handling below, and are
             * re-entered on the next outer iteration with the writer's
             * kind and data pointer re-read. */
            enum PyUnicode_Kind outkind = writer->kind;
            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* Also catches 0xfffe, which is above any 1-byte
                     * maxchar. */
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        /* Generic path: any table kind/length and any writer kind. */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
Error:
        /* Reached by fall-through or from the fast loops above; x holds
         * the table entry for the byte at s. */
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            /* s is passed by address to the handler call above and is not
             * advanced here. */
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
7612
7613static int
7614charmap_decode_mapping(const char *s,
7615 Py_ssize_t size,
7616 PyObject *mapping,
7617 const char *errors,
7618 _PyUnicodeWriter *writer)
7619{
7620 const char *starts = s;
7621 const char *e;
7622 Py_ssize_t startinpos, endinpos;
7623 PyObject *errorHandler = NULL, *exc = NULL;
7624 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007625 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007626
7627 e = s + size;
7628
7629 while (s < e) {
7630 ch = *s;
7631
7632 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7633 key = PyLong_FromLong((long)ch);
7634 if (key == NULL)
7635 goto onError;
7636
7637 item = PyObject_GetItem(mapping, key);
7638 Py_DECREF(key);
7639 if (item == NULL) {
7640 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7641 /* No mapping found means: mapping is undefined. */
7642 PyErr_Clear();
7643 goto Undefined;
7644 } else
7645 goto onError;
7646 }
7647
7648 /* Apply mapping */
7649 if (item == Py_None)
7650 goto Undefined;
7651 if (PyLong_Check(item)) {
7652 long value = PyLong_AS_LONG(item);
7653 if (value == 0xFFFE)
7654 goto Undefined;
7655 if (value < 0 || value > MAX_UNICODE) {
7656 PyErr_Format(PyExc_TypeError,
7657 "character mapping must be in range(0x%lx)",
7658 (unsigned long)MAX_UNICODE + 1);
7659 goto onError;
7660 }
7661
7662 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7663 goto onError;
7664 }
7665 else if (PyUnicode_Check(item)) {
7666 if (PyUnicode_READY(item) == -1)
7667 goto onError;
7668 if (PyUnicode_GET_LENGTH(item) == 1) {
7669 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7670 if (value == 0xFFFE)
7671 goto Undefined;
7672 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7673 goto onError;
7674 }
7675 else {
7676 writer->overallocate = 1;
7677 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7678 goto onError;
7679 }
7680 }
7681 else {
7682 /* wrong return value */
7683 PyErr_SetString(PyExc_TypeError,
7684 "character mapping must return integer, None or str");
7685 goto onError;
7686 }
7687 Py_CLEAR(item);
7688 ++s;
7689 continue;
7690
7691Undefined:
7692 /* undefined mapping */
7693 Py_CLEAR(item);
7694 startinpos = s-starts;
7695 endinpos = startinpos+1;
7696 if (unicode_decode_call_errorhandler_writer(
7697 errors, &errorHandler,
7698 "charmap", "character maps to <undefined>",
7699 &starts, &e, &startinpos, &endinpos, &exc, &s,
7700 writer)) {
7701 goto onError;
7702 }
7703 }
7704 Py_XDECREF(errorHandler);
7705 Py_XDECREF(exc);
7706 return 0;
7707
7708onError:
7709 Py_XDECREF(item);
7710 Py_XDECREF(errorHandler);
7711 Py_XDECREF(exc);
7712 return -1;
7713}
7714
Alexander Belopolsky40018472011-02-26 01:02:56 +00007715PyObject *
7716PyUnicode_DecodeCharmap(const char *s,
7717 Py_ssize_t size,
7718 PyObject *mapping,
7719 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007721 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007722
Guido van Rossumd57fd912000-03-10 22:53:23 +00007723 /* Default to Latin-1 */
7724 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007725 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007728 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007729 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007730 writer.min_length = size;
7731 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007733
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007734 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007735 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7736 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007737 }
7738 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007739 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7740 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007742 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007743
Benjamin Peterson29060642009-01-31 22:14:21 +00007744 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007745 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746 return NULL;
7747}
7748
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007749/* Charmap encoding: the lookup table */
7750
Alexander Belopolsky40018472011-02-26 01:02:56 +00007751struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 PyObject_HEAD
7753 unsigned char level1[32];
7754 int count2, count3;
7755 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007756};
7757
7758static PyObject*
7759encoding_map_size(PyObject *obj, PyObject* args)
7760{
7761 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007762 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007764}
7765
7766static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007767 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007768 PyDoc_STR("Return the size (in bytes) of this object") },
7769 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007770};
7771
7772static void
7773encoding_map_dealloc(PyObject* o)
7774{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007775 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007776}
7777
7778static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007779 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007780 "EncodingMap", /*tp_name*/
7781 sizeof(struct encoding_map), /*tp_basicsize*/
7782 0, /*tp_itemsize*/
7783 /* methods */
7784 encoding_map_dealloc, /*tp_dealloc*/
7785 0, /*tp_print*/
7786 0, /*tp_getattr*/
7787 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007788 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 0, /*tp_repr*/
7790 0, /*tp_as_number*/
7791 0, /*tp_as_sequence*/
7792 0, /*tp_as_mapping*/
7793 0, /*tp_hash*/
7794 0, /*tp_call*/
7795 0, /*tp_str*/
7796 0, /*tp_getattro*/
7797 0, /*tp_setattro*/
7798 0, /*tp_as_buffer*/
7799 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7800 0, /*tp_doc*/
7801 0, /*tp_traverse*/
7802 0, /*tp_clear*/
7803 0, /*tp_richcompare*/
7804 0, /*tp_weaklistoffset*/
7805 0, /*tp_iter*/
7806 0, /*tp_iternext*/
7807 encoding_map_methods, /*tp_methods*/
7808 0, /*tp_members*/
7809 0, /*tp_getset*/
7810 0, /*tp_base*/
7811 0, /*tp_dict*/
7812 0, /*tp_descr_get*/
7813 0, /*tp_descr_set*/
7814 0, /*tp_dictoffset*/
7815 0, /*tp_init*/
7816 0, /*tp_alloc*/
7817 0, /*tp_new*/
7818 0, /*tp_free*/
7819 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007820};
7821
7822PyObject*
7823PyUnicode_BuildEncodingMap(PyObject* string)
7824{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007825 PyObject *result;
7826 struct encoding_map *mresult;
7827 int i;
7828 int need_dict = 0;
7829 unsigned char level1[32];
7830 unsigned char level2[512];
7831 unsigned char *mlevel1, *mlevel2, *mlevel3;
7832 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007833 int kind;
7834 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007835 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007836 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007837
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007838 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007839 PyErr_BadArgument();
7840 return NULL;
7841 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007842 kind = PyUnicode_KIND(string);
7843 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007844 length = PyUnicode_GET_LENGTH(string);
7845 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007846 memset(level1, 0xFF, sizeof level1);
7847 memset(level2, 0xFF, sizeof level2);
7848
7849 /* If there isn't a one-to-one mapping of NULL to \0,
7850 or if there are non-BMP characters, we need to use
7851 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007852 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007853 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007854 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007855 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007856 ch = PyUnicode_READ(kind, data, i);
7857 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007858 need_dict = 1;
7859 break;
7860 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007861 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007862 /* unmapped character */
7863 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007864 l1 = ch >> 11;
7865 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007866 if (level1[l1] == 0xFF)
7867 level1[l1] = count2++;
7868 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007869 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007870 }
7871
7872 if (count2 >= 0xFF || count3 >= 0xFF)
7873 need_dict = 1;
7874
7875 if (need_dict) {
7876 PyObject *result = PyDict_New();
7877 PyObject *key, *value;
7878 if (!result)
7879 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007880 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007881 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007882 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007883 if (!key || !value)
7884 goto failed1;
7885 if (PyDict_SetItem(result, key, value) == -1)
7886 goto failed1;
7887 Py_DECREF(key);
7888 Py_DECREF(value);
7889 }
7890 return result;
7891 failed1:
7892 Py_XDECREF(key);
7893 Py_XDECREF(value);
7894 Py_DECREF(result);
7895 return NULL;
7896 }
7897
7898 /* Create a three-level trie */
7899 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7900 16*count2 + 128*count3 - 1);
7901 if (!result)
7902 return PyErr_NoMemory();
7903 PyObject_Init(result, &EncodingMapType);
7904 mresult = (struct encoding_map*)result;
7905 mresult->count2 = count2;
7906 mresult->count3 = count3;
7907 mlevel1 = mresult->level1;
7908 mlevel2 = mresult->level23;
7909 mlevel3 = mresult->level23 + 16*count2;
7910 memcpy(mlevel1, level1, 32);
7911 memset(mlevel2, 0xFF, 16*count2);
7912 memset(mlevel3, 0, 128*count3);
7913 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007914 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007915 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007916 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7917 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007918 /* unmapped character */
7919 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007920 o1 = ch>>11;
7921 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007922 i2 = 16*mlevel1[o1] + o2;
7923 if (mlevel2[i2] == 0xFF)
7924 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007925 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007926 i3 = 128*mlevel2[i2] + o3;
7927 mlevel3[i3] = i;
7928 }
7929 return result;
7930}
7931
7932static int
Victor Stinner22168992011-11-20 17:09:18 +01007933encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007934{
7935 struct encoding_map *map = (struct encoding_map*)mapping;
7936 int l1 = c>>11;
7937 int l2 = (c>>7) & 0xF;
7938 int l3 = c & 0x7F;
7939 int i;
7940
Victor Stinner22168992011-11-20 17:09:18 +01007941 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007942 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007943 if (c == 0)
7944 return 0;
7945 /* level 1*/
7946 i = map->level1[l1];
7947 if (i == 0xFF) {
7948 return -1;
7949 }
7950 /* level 2*/
7951 i = map->level23[16*i+l2];
7952 if (i == 0xFF) {
7953 return -1;
7954 }
7955 /* level 3 */
7956 i = map->level23[16*map->count2 + 128*i + l3];
7957 if (i == 0) {
7958 return -1;
7959 }
7960 return i;
7961}
7962
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007963/* Lookup the character ch in the mapping. If the character
7964 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007965 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007966static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007967charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968{
Christian Heimes217cfd12007-12-02 14:31:20 +00007969 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007970 PyObject *x;
7971
7972 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007973 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007974 x = PyObject_GetItem(mapping, w);
7975 Py_DECREF(w);
7976 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007977 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7978 /* No mapping found means: mapping is undefined. */
7979 PyErr_Clear();
7980 x = Py_None;
7981 Py_INCREF(x);
7982 return x;
7983 } else
7984 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007986 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007988 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 long value = PyLong_AS_LONG(x);
7990 if (value < 0 || value > 255) {
7991 PyErr_SetString(PyExc_TypeError,
7992 "character mapping must be in range(256)");
7993 Py_DECREF(x);
7994 return NULL;
7995 }
7996 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007998 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007999 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008001 /* wrong return value */
8002 PyErr_Format(PyExc_TypeError,
8003 "character mapping must return integer, bytes or None, not %.400s",
8004 x->ob_type->tp_name);
8005 Py_DECREF(x);
8006 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007 }
8008}
8009
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008010static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008011charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008012{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008013 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8014 /* exponentially overallocate to minimize reallocations */
8015 if (requiredsize < 2*outsize)
8016 requiredsize = 2*outsize;
8017 if (_PyBytes_Resize(outobj, requiredsize))
8018 return -1;
8019 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008020}
8021
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or the buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008025/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008026 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008027 space is available. Return a new reference to the object that
8028 was put in the output buffer, or Py_None, if the mapping was undefined
8029 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008030 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008031static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008032charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008033 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008034{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008035 PyObject *rep;
8036 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008037 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008038
Christian Heimes90aa7642007-12-19 02:45:37 +00008039 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008040 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008042 if (res == -1)
8043 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 if (outsize<requiredsize)
8045 if (charmapencode_resize(outobj, outpos, requiredsize))
8046 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008047 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 outstart[(*outpos)++] = (char)res;
8049 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008050 }
8051
8052 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008053 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008054 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008055 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008056 Py_DECREF(rep);
8057 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008058 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008059 if (PyLong_Check(rep)) {
8060 Py_ssize_t requiredsize = *outpos+1;
8061 if (outsize<requiredsize)
8062 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8063 Py_DECREF(rep);
8064 return enc_EXCEPTION;
8065 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008066 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008068 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 else {
8070 const char *repchars = PyBytes_AS_STRING(rep);
8071 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8072 Py_ssize_t requiredsize = *outpos+repsize;
8073 if (outsize<requiredsize)
8074 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8075 Py_DECREF(rep);
8076 return enc_EXCEPTION;
8077 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008078 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008079 memcpy(outstart + *outpos, repchars, repsize);
8080 *outpos += repsize;
8081 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008082 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008083 Py_DECREF(rep);
8084 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008085}
8086
/* Handle an unencodable character for _PyUnicode_EncodeCharmap.

   On entry *inpos is the index in `unicode` of a character the mapping
   cannot encode.  The function extends that position to the whole run of
   consecutive unencodable characters, then applies the error handler named
   by `errors`: the built-in "strict", "replace", "ignore" and
   "xmlcharrefreplace" are handled inline, anything else goes through
   unicode_encode_call_errorhandler().  Replacement output is appended to
   *res at *respos, and *inpos is moved past the handled run (or to the
   position returned by the callback).

   *known_errorHandler caches the classification of `errors` between calls
   (-1 = not yet determined).

   Return 0 on success, -1 on error */
static int
charmap_encoding_error(
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    int *known_errorHandler, PyObject **errorHandler, const char *errors,
    PyObject **res, Py_ssize_t *respos)
{
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    Py_ssize_t size, repsize;
    Py_ssize_t newpos;
    enum PyUnicode_Kind kind;
    void *data;
    Py_ssize_t index;
    /* startpos for collecting unencodable chars */
    Py_ssize_t collstartpos = *inpos;
    Py_ssize_t collendpos = *inpos+1;
    Py_ssize_t collpos;
    char *encoding = "charmap";
    char *reason = "character maps to <undefined>";
    charmapencode_result x;
    Py_UCS4 ch;
    int val;

    if (PyUnicode_READY(unicode) == -1)
        return -1;
    size = PyUnicode_GET_LENGTH(unicode);
    /* find all unencodable characters */
    while (collendpos < size) {
        PyObject *rep;
        if (Py_TYPE(mapping) == &EncodingMapType) {
            /* EncodingMap: -1 means "no mapping", anything else ends
               the run */
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
            val = encoding_map_lookup(ch, mapping);
            if (val != -1)
                break;
            ++collendpos;
            continue;
        }

        /* generic mapping: Py_None means "no mapping" */
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
        rep = charmapencode_lookup(ch, mapping);
        if (rep==NULL)
            return -1;
        else if (rep!=Py_None) {
            Py_DECREF(rep);
            break;
        }
        Py_DECREF(rep);
        ++collendpos;
    }
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*known_errorHandler==-1) {
        if ((errors==NULL) || (!strcmp(errors, "strict")))
            *known_errorHandler = 1;
        else if (!strcmp(errors, "replace"))
            *known_errorHandler = 2;
        else if (!strcmp(errors, "ignore"))
            *known_errorHandler = 3;
        else if (!strcmp(errors, "xmlcharrefreplace"))
            *known_errorHandler = 4;
        else
            *known_errorHandler = 0;
    }
    switch (*known_errorHandler) {
    case 1: /* strict */
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
        return -1;
    case 2: /* replace */
        /* emit one '?' per unencodable character; the '?' itself must be
           encodable through the mapping */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
            x = charmapencode_output('?', mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* fall through */
    case 3: /* ignore */
        *inpos = collendpos;
        break;
    case 4: /* xmlcharrefreplace */
        /* generate the "&#NNN;" replacement for each character and encode
           it through the mapping one character at a time */
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
            char buffer[2+29+1+1];
            char *cp;
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
            for (cp = buffer; *cp; ++cp) {
                x = charmapencode_output(*cp, mapping, res, respos);
                if (x==enc_EXCEPTION)
                    return -1;
                else if (x==enc_FAILED) {
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                    return -1;
                }
            }
        }
        *inpos = collendpos;
        break;
    default:
        /* any other handler name: call the error callback; it returns the
           replacement object and stores the resume position in newpos */
        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
                                                      encoding, reason, unicode, exceptionObject,
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
        if (PyBytes_Check(repunicode)) {
            /* Directly copy bytes result to output. */
            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode), repsize);
            *respos += repsize;
            *inpos = newpos;
            Py_DECREF(repunicode);
            break;
        }
        /* generate replacement: the callback returned a str, so each of
           its characters is encoded through the mapping */
        if (PyUnicode_READY(repunicode) == -1) {
            Py_DECREF(repunicode);
            return -1;
        }
        repsize = PyUnicode_GET_LENGTH(repunicode);
        data = PyUnicode_DATA(repunicode);
        kind = PyUnicode_KIND(repunicode);
        for (index = 0; index < repsize; index++) {
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
            x = charmapencode_output(repch, mapping, res, respos);
            if (x==enc_EXCEPTION) {
                Py_DECREF(repunicode);
                return -1;
            }
            else if (x==enc_FAILED) {
                Py_DECREF(repunicode);
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        *inpos = newpos;
        Py_DECREF(repunicode);
    }
    return 0;
}
8240
Alexander Belopolsky40018472011-02-26 01:02:56 +00008241PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008242_PyUnicode_EncodeCharmap(PyObject *unicode,
8243 PyObject *mapping,
8244 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008246 /* output object */
8247 PyObject *res = NULL;
8248 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008249 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008250 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008251 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008252 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008253 PyObject *errorHandler = NULL;
8254 PyObject *exc = NULL;
8255 /* the following variable is used for caching string comparisons
8256 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8257 * 3=ignore, 4=xmlcharrefreplace */
8258 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008259 void *data;
8260 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261
Benjamin Petersonbac79492012-01-14 13:34:47 -05008262 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008263 return NULL;
8264 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008265 data = PyUnicode_DATA(unicode);
8266 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008267
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268 /* Default to Latin-1 */
8269 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008270 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008272 /* allocate enough for a simple encoding without
8273 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008274 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008275 if (res == NULL)
8276 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008277 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008278 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008280 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008281 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008283 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 if (x==enc_EXCEPTION) /* error */
8285 goto onError;
8286 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008287 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008288 &exc,
8289 &known_errorHandler, &errorHandler, errors,
8290 &res, &respos)) {
8291 goto onError;
8292 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008293 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008294 else
8295 /* done with this character => adjust input position */
8296 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008299 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008300 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008301 if (_PyBytes_Resize(&res, respos) < 0)
8302 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008303
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008304 Py_XDECREF(exc);
8305 Py_XDECREF(errorHandler);
8306 return res;
8307
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008309 Py_XDECREF(res);
8310 Py_XDECREF(exc);
8311 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312 return NULL;
8313}
8314
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008315/* Deprecated */
8316PyObject *
8317PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8318 Py_ssize_t size,
8319 PyObject *mapping,
8320 const char *errors)
8321{
8322 PyObject *result;
8323 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8324 if (unicode == NULL)
8325 return NULL;
8326 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8327 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008328 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008329}
8330
Alexander Belopolsky40018472011-02-26 01:02:56 +00008331PyObject *
8332PyUnicode_AsCharmapString(PyObject *unicode,
8333 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334{
8335 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 PyErr_BadArgument();
8337 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008339 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340}
8341
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008342/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008343static void
8344make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008345 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008346 Py_ssize_t startpos, Py_ssize_t endpos,
8347 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008349 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008350 *exceptionObject = _PyUnicodeTranslateError_Create(
8351 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352 }
8353 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8355 goto onError;
8356 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8357 goto onError;
8358 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8359 goto onError;
8360 return;
8361 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008362 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363 }
8364}
8365
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008366/* error handling callback helper:
8367 build arguments, call the callback and check the arguments,
8368 put the result into newpos and return the replacement string, which
8369 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008370static PyObject *
8371unicode_translate_call_errorhandler(const char *errors,
8372 PyObject **errorHandler,
8373 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008374 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008375 Py_ssize_t startpos, Py_ssize_t endpos,
8376 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008378 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008379
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008380 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381 PyObject *restuple;
8382 PyObject *resunicode;
8383
8384 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008386 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008388 }
8389
8390 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008394
8395 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008397 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008399 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008400 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 Py_DECREF(restuple);
8402 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403 }
8404 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 &resunicode, &i_newpos)) {
8406 Py_DECREF(restuple);
8407 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008409 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008410 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008411 else
8412 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008413 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008414 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 Py_DECREF(restuple);
8416 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008417 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418 Py_INCREF(resunicode);
8419 Py_DECREF(restuple);
8420 return resunicode;
8421}
8422
8423/* Lookup the character ch in the mapping and put the result in result,
8424 which must be decrefed by the caller.
8425 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008426static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008427charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008428{
Christian Heimes217cfd12007-12-02 14:31:20 +00008429 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008430 PyObject *x;
8431
8432 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008433 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434 x = PyObject_GetItem(mapping, w);
8435 Py_DECREF(w);
8436 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8438 /* No mapping found means: use 1:1 mapping. */
8439 PyErr_Clear();
8440 *result = NULL;
8441 return 0;
8442 } else
8443 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008444 }
8445 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 *result = x;
8447 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008448 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008449 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008451 if (value < 0 || value > MAX_UNICODE) {
8452 PyErr_Format(PyExc_ValueError,
8453 "character mapping must be in range(0x%x)",
8454 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 Py_DECREF(x);
8456 return -1;
8457 }
8458 *result = x;
8459 return 0;
8460 }
8461 else if (PyUnicode_Check(x)) {
8462 *result = x;
8463 return 0;
8464 }
8465 else {
8466 /* wrong return value */
8467 PyErr_SetString(PyExc_TypeError,
8468 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008469 Py_DECREF(x);
8470 return -1;
8471 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008472}
Victor Stinner1194ea02014-04-04 19:37:40 +02008473
8474/* lookup the character, write the result into the writer.
8475 Return 1 if the result was written into the writer, return 0 if the mapping
8476 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008477static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008478charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8479 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008480{
Victor Stinner1194ea02014-04-04 19:37:40 +02008481 PyObject *item;
8482
8483 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008484 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008485
8486 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008487 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008488 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008489 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008490 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008491 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008492 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008493
8494 if (item == Py_None) {
8495 Py_DECREF(item);
8496 return 0;
8497 }
8498
8499 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008500 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8501 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8502 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008503 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8504 Py_DECREF(item);
8505 return -1;
8506 }
8507 Py_DECREF(item);
8508 return 1;
8509 }
8510
8511 if (!PyUnicode_Check(item)) {
8512 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008514 }
8515
8516 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8517 Py_DECREF(item);
8518 return -1;
8519 }
8520
8521 Py_DECREF(item);
8522 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008523}
8524
Victor Stinner89a76ab2014-04-05 11:44:04 +02008525static int
8526unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8527 Py_UCS1 *translate)
8528{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008529 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008530 int ret = 0;
8531
Victor Stinner89a76ab2014-04-05 11:44:04 +02008532 if (charmaptranslate_lookup(ch, mapping, &item)) {
8533 return -1;
8534 }
8535
8536 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008537 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008538 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008539 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008540 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008541 /* not found => default to 1:1 mapping */
8542 translate[ch] = ch;
8543 return 1;
8544 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008545 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008546 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008547 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8548 used it */
8549 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008550 /* invalid character or character outside ASCII:
8551 skip the fast translate */
8552 goto exit;
8553 }
8554 translate[ch] = (Py_UCS1)replace;
8555 }
8556 else if (PyUnicode_Check(item)) {
8557 Py_UCS4 replace;
8558
8559 if (PyUnicode_READY(item) == -1) {
8560 Py_DECREF(item);
8561 return -1;
8562 }
8563 if (PyUnicode_GET_LENGTH(item) != 1)
8564 goto exit;
8565
8566 replace = PyUnicode_READ_CHAR(item, 0);
8567 if (replace > 127)
8568 goto exit;
8569 translate[ch] = (Py_UCS1)replace;
8570 }
8571 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008572 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008573 goto exit;
8574 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008575 ret = 1;
8576
Benjamin Peterson1365de72014-04-07 20:15:41 -04008577 exit:
8578 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008579 return ret;
8580}
8581
8582/* Fast path for ascii => ascii translation. Return 1 if the whole string
8583 was translated into writer, return 0 if the input string was partially
8584 translated into writer, raise an exception and return -1 on error. */
8585static int
8586unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008587 _PyUnicodeWriter *writer, int ignore,
8588 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008589{
Victor Stinner872b2912014-04-05 14:27:07 +02008590 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008591 Py_ssize_t len;
8592 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008593 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008594
Victor Stinner89a76ab2014-04-05 11:44:04 +02008595 len = PyUnicode_GET_LENGTH(input);
8596
Victor Stinner872b2912014-04-05 14:27:07 +02008597 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008598
8599 in = PyUnicode_1BYTE_DATA(input);
8600 end = in + len;
8601
8602 assert(PyUnicode_IS_ASCII(writer->buffer));
8603 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8604 out = PyUnicode_1BYTE_DATA(writer->buffer);
8605
Victor Stinner872b2912014-04-05 14:27:07 +02008606 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008607 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008608 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008609 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008610 int translate = unicode_fast_translate_lookup(mapping, ch,
8611 ascii_table);
8612 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008613 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008614 if (translate == 0)
8615 goto exit;
8616 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008617 }
Victor Stinner872b2912014-04-05 14:27:07 +02008618 if (ch2 == 0xfe) {
8619 if (ignore)
8620 continue;
8621 goto exit;
8622 }
8623 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008624 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008625 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008626 }
Victor Stinner872b2912014-04-05 14:27:07 +02008627 res = 1;
8628
8629exit:
8630 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008631 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008632 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008633}
8634
Alexander Belopolsky40018472011-02-26 01:02:56 +00008635PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636_PyUnicode_TranslateCharmap(PyObject *input,
8637 PyObject *mapping,
8638 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008640 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008641 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008642 Py_ssize_t size, i;
8643 int kind;
8644 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008645 _PyUnicodeWriter writer;
8646 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647 char *reason = "character maps to <undefined>";
8648 PyObject *errorHandler = NULL;
8649 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008650 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008651 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008652
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 PyErr_BadArgument();
8655 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008656 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008658 if (PyUnicode_READY(input) == -1)
8659 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008660 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008661 kind = PyUnicode_KIND(input);
8662 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663
8664 if (size == 0) {
8665 Py_INCREF(input);
8666 return input;
8667 }
8668
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669 /* allocate enough for a simple 1:1 translation without
8670 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008671 _PyUnicodeWriter_Init(&writer);
8672 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674
Victor Stinner872b2912014-04-05 14:27:07 +02008675 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8676
Victor Stinner33798672016-03-01 21:59:58 +01008677 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008678 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008679 if (PyUnicode_IS_ASCII(input)) {
8680 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8681 if (res < 0) {
8682 _PyUnicodeWriter_Dealloc(&writer);
8683 return NULL;
8684 }
8685 if (res == 1)
8686 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008687 }
Victor Stinner33798672016-03-01 21:59:58 +01008688 else {
8689 i = 0;
8690 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008692 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008693 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008694 int translate;
8695 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8696 Py_ssize_t newpos;
8697 /* startpos for collecting untranslatable chars */
8698 Py_ssize_t collstart;
8699 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008700 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008701
Victor Stinner1194ea02014-04-04 19:37:40 +02008702 ch = PyUnicode_READ(kind, data, i);
8703 translate = charmaptranslate_output(ch, mapping, &writer);
8704 if (translate < 0)
8705 goto onError;
8706
8707 if (translate != 0) {
8708 /* it worked => adjust input pointer */
8709 ++i;
8710 continue;
8711 }
8712
8713 /* untranslatable character */
8714 collstart = i;
8715 collend = i+1;
8716
8717 /* find all untranslatable characters */
8718 while (collend < size) {
8719 PyObject *x;
8720 ch = PyUnicode_READ(kind, data, collend);
8721 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008722 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008723 Py_XDECREF(x);
8724 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008726 ++collend;
8727 }
8728
8729 if (ignore) {
8730 i = collend;
8731 }
8732 else {
8733 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8734 reason, input, &exc,
8735 collstart, collend, &newpos);
8736 if (repunicode == NULL)
8737 goto onError;
8738 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008740 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008741 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008742 Py_DECREF(repunicode);
8743 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008744 }
8745 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008746 Py_XDECREF(exc);
8747 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008748 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749
Benjamin Peterson29060642009-01-31 22:14:21 +00008750 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008751 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008752 Py_XDECREF(exc);
8753 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754 return NULL;
8755}
8756
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008757/* Deprecated. Use PyUnicode_Translate instead. */
8758PyObject *
8759PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8760 Py_ssize_t size,
8761 PyObject *mapping,
8762 const char *errors)
8763{
Christian Heimes5f520f42012-09-11 14:03:25 +02008764 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008765 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8766 if (!unicode)
8767 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008768 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8769 Py_DECREF(unicode);
8770 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008771}
8772
Alexander Belopolsky40018472011-02-26 01:02:56 +00008773PyObject *
8774PyUnicode_Translate(PyObject *str,
8775 PyObject *mapping,
8776 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777{
8778 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008779
Guido van Rossumd57fd912000-03-10 22:53:23 +00008780 str = PyUnicode_FromObject(str);
8781 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008782 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008783 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008784 Py_DECREF(str);
8785 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786}
Tim Petersced69f82003-09-16 20:30:58 +00008787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008788static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008789fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008790{
8791 /* No need to call PyUnicode_READY(self) because this function is only
8792 called as a callback from fixup() which does it already. */
8793 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8794 const int kind = PyUnicode_KIND(self);
8795 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008796 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008797 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008798 Py_ssize_t i;
8799
8800 for (i = 0; i < len; ++i) {
8801 ch = PyUnicode_READ(kind, data, i);
8802 fixed = 0;
8803 if (ch > 127) {
8804 if (Py_UNICODE_ISSPACE(ch))
8805 fixed = ' ';
8806 else {
8807 const int decimal = Py_UNICODE_TODECIMAL(ch);
8808 if (decimal >= 0)
8809 fixed = '0' + decimal;
8810 }
8811 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008812 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008813 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008814 PyUnicode_WRITE(kind, data, i, fixed);
8815 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008816 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008817 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008818 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008819 }
8820
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008821 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008822}
8823
8824PyObject *
8825_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8826{
8827 if (!PyUnicode_Check(unicode)) {
8828 PyErr_BadInternalCall();
8829 return NULL;
8830 }
8831 if (PyUnicode_READY(unicode) == -1)
8832 return NULL;
8833 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8834 /* If the string is already ASCII, just return the same string */
8835 Py_INCREF(unicode);
8836 return unicode;
8837 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008838 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008839}
8840
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008841PyObject *
8842PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8843 Py_ssize_t length)
8844{
Victor Stinnerf0124502011-11-21 23:12:56 +01008845 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008846 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008847 Py_UCS4 maxchar;
8848 enum PyUnicode_Kind kind;
8849 void *data;
8850
Victor Stinner99d7ad02012-02-22 13:37:39 +01008851 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008852 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02008853 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008854 if (ch > 127) {
8855 int decimal = Py_UNICODE_TODECIMAL(ch);
8856 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008857 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008858 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008859 }
8860 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008861
8862 /* Copy to a new string */
8863 decimal = PyUnicode_New(length, maxchar);
8864 if (decimal == NULL)
8865 return decimal;
8866 kind = PyUnicode_KIND(decimal);
8867 data = PyUnicode_DATA(decimal);
8868 /* Iterate over code points */
8869 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02008870 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01008871 if (ch > 127) {
8872 int decimal = Py_UNICODE_TODECIMAL(ch);
8873 if (decimal >= 0)
8874 ch = '0' + decimal;
8875 }
8876 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008877 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008878 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008879}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008880/* --- Decimal Encoder ---------------------------------------------------- */
8881
Alexander Belopolsky40018472011-02-26 01:02:56 +00008882int
8883PyUnicode_EncodeDecimal(Py_UNICODE *s,
8884 Py_ssize_t length,
8885 char *output,
8886 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008887{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008888 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008889 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008890 enum PyUnicode_Kind kind;
8891 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008892
8893 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008894 PyErr_BadArgument();
8895 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008896 }
8897
Victor Stinner42bf7752011-11-21 22:52:58 +01008898 unicode = PyUnicode_FromUnicode(s, length);
8899 if (unicode == NULL)
8900 return -1;
8901
Benjamin Petersonbac79492012-01-14 13:34:47 -05008902 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008903 Py_DECREF(unicode);
8904 return -1;
8905 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008906 kind = PyUnicode_KIND(unicode);
8907 data = PyUnicode_DATA(unicode);
8908
Victor Stinnerb84d7232011-11-22 01:50:07 +01008909 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008910 PyObject *exc;
8911 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008912 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008913 Py_ssize_t startpos;
8914
8915 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008916
Benjamin Peterson29060642009-01-31 22:14:21 +00008917 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008918 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008919 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008920 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008921 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008922 decimal = Py_UNICODE_TODECIMAL(ch);
8923 if (decimal >= 0) {
8924 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008925 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008926 continue;
8927 }
8928 if (0 < ch && ch < 256) {
8929 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008930 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008931 continue;
8932 }
Victor Stinner6345be92011-11-25 20:09:01 +01008933
Victor Stinner42bf7752011-11-21 22:52:58 +01008934 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008935 exc = NULL;
8936 raise_encode_exception(&exc, "decimal", unicode,
8937 startpos, startpos+1,
8938 "invalid decimal Unicode string");
8939 Py_XDECREF(exc);
8940 Py_DECREF(unicode);
8941 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008942 }
8943 /* 0-terminate the output string */
8944 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008945 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008946 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008947}
8948
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949/* --- Helpers ------------------------------------------------------------ */
8950
/* helper macro to fixup start/end slice values:
   clamp `end` to `len`, make negative `start`/`end` relative to `len`,
   and clamp anything still negative to 0.  `start` and `end` must be
   modifiable lvalues; `len` may be evaluated more than once.
   Wrapped in do { } while (0) so that "ADJUST_INDICES(...);" is a single
   statement, usable in an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
8965
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008966static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008967any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008968 Py_ssize_t start,
8969 Py_ssize_t end)
8970{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008971 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008972 void *buf1, *buf2;
8973 Py_ssize_t len1, len2, result;
8974
8975 kind1 = PyUnicode_KIND(s1);
8976 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008977 if (kind1 < kind2)
8978 return -1;
8979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980 len1 = PyUnicode_GET_LENGTH(s1);
8981 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008982 ADJUST_INDICES(start, end, len1);
8983 if (end - start < len2)
8984 return -1;
8985
8986 buf1 = PyUnicode_DATA(s1);
8987 buf2 = PyUnicode_DATA(s2);
8988 if (len2 == 1) {
8989 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
8990 result = findchar((const char *)buf1 + kind1*start,
8991 kind1, end - start, ch, direction);
8992 if (result == -1)
8993 return -1;
8994 else
8995 return start + result;
8996 }
8997
8998 if (kind2 != kind1) {
8999 buf2 = _PyUnicode_AsKind(s2, kind1);
9000 if (!buf2)
9001 return -2;
9002 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009003
Victor Stinner794d5672011-10-10 03:21:36 +02009004 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009005 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009006 case PyUnicode_1BYTE_KIND:
9007 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9008 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9009 else
9010 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9011 break;
9012 case PyUnicode_2BYTE_KIND:
9013 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9014 break;
9015 case PyUnicode_4BYTE_KIND:
9016 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9017 break;
9018 default:
9019 assert(0); result = -2;
9020 }
9021 }
9022 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009023 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009024 case PyUnicode_1BYTE_KIND:
9025 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9026 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9027 else
9028 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9029 break;
9030 case PyUnicode_2BYTE_KIND:
9031 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9032 break;
9033 case PyUnicode_4BYTE_KIND:
9034 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9035 break;
9036 default:
9037 assert(0); result = -2;
9038 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039 }
9040
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009041 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009042 PyMem_Free(buf2);
9043
9044 return result;
9045}
9046
9047Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009048_PyUnicode_InsertThousandsGrouping(
9049 PyObject *unicode, Py_ssize_t index,
9050 Py_ssize_t n_buffer,
9051 void *digits, Py_ssize_t n_digits,
9052 Py_ssize_t min_width,
9053 const char *grouping, PyObject *thousands_sep,
9054 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009055{
Victor Stinner41a863c2012-02-24 00:37:51 +01009056 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009057 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009058 Py_ssize_t thousands_sep_len;
9059 Py_ssize_t len;
9060
9061 if (unicode != NULL) {
9062 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009063 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009064 }
9065 else {
9066 kind = PyUnicode_1BYTE_KIND;
9067 data = NULL;
9068 }
9069 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9070 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9071 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9072 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009073 if (thousands_sep_kind < kind) {
9074 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9075 if (!thousands_sep_data)
9076 return -1;
9077 }
9078 else {
9079 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9080 if (!data)
9081 return -1;
9082 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009083 }
9084
Benjamin Petersonead6b532011-12-20 17:23:42 -06009085 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009086 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009087 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009088 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009089 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009090 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009091 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009092 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009093 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009094 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009095 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009096 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009097 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009099 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009100 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009101 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009102 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009103 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009104 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009105 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009106 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009107 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009108 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009109 break;
9110 default:
9111 assert(0);
9112 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009113 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009114 if (unicode != NULL && thousands_sep_kind != kind) {
9115 if (thousands_sep_kind < kind)
9116 PyMem_Free(thousands_sep_data);
9117 else
9118 PyMem_Free(data);
9119 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009120 if (unicode == NULL) {
9121 *maxchar = 127;
9122 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009123 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009124 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009125 }
9126 }
9127 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009128}
9129
9130
Alexander Belopolsky40018472011-02-26 01:02:56 +00009131Py_ssize_t
9132PyUnicode_Count(PyObject *str,
9133 PyObject *substr,
9134 Py_ssize_t start,
9135 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009136{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009137 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009138 PyObject* str_obj;
9139 PyObject* sub_obj;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009140 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009141 void *buf1 = NULL, *buf2 = NULL;
9142 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009143
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009144 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009145 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009146 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009147 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009148 if (!sub_obj) {
9149 Py_DECREF(str_obj);
9150 return -1;
9151 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009152 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009153 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009154 Py_DECREF(str_obj);
9155 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009156 }
Tim Petersced69f82003-09-16 20:30:58 +00009157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009158 kind1 = PyUnicode_KIND(str_obj);
9159 kind2 = PyUnicode_KIND(sub_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009160 if (kind1 < kind2) {
9161 Py_DECREF(sub_obj);
9162 Py_DECREF(str_obj);
9163 return 0;
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009164 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009166 len1 = PyUnicode_GET_LENGTH(str_obj);
9167 len2 = PyUnicode_GET_LENGTH(sub_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009168 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009169 if (end - start < len2) {
9170 Py_DECREF(sub_obj);
9171 Py_DECREF(str_obj);
9172 return 0;
9173 }
9174
9175 buf1 = PyUnicode_DATA(str_obj);
9176 buf2 = PyUnicode_DATA(sub_obj);
9177 if (kind2 != kind1) {
9178 buf2 = _PyUnicode_AsKind(sub_obj, kind1);
9179 if (!buf2)
9180 goto onError;
9181 }
9182
9183 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009184 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009185 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9186 result = asciilib_count(
9187 ((Py_UCS1*)buf1) + start, end - start,
9188 buf2, len2, PY_SSIZE_T_MAX
9189 );
9190 else
9191 result = ucs1lib_count(
9192 ((Py_UCS1*)buf1) + start, end - start,
9193 buf2, len2, PY_SSIZE_T_MAX
9194 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009195 break;
9196 case PyUnicode_2BYTE_KIND:
9197 result = ucs2lib_count(
9198 ((Py_UCS2*)buf1) + start, end - start,
9199 buf2, len2, PY_SSIZE_T_MAX
9200 );
9201 break;
9202 case PyUnicode_4BYTE_KIND:
9203 result = ucs4lib_count(
9204 ((Py_UCS4*)buf1) + start, end - start,
9205 buf2, len2, PY_SSIZE_T_MAX
9206 );
9207 break;
9208 default:
9209 assert(0); result = 0;
9210 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009211
9212 Py_DECREF(sub_obj);
9213 Py_DECREF(str_obj);
9214
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009215 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009216 PyMem_Free(buf2);
9217
Guido van Rossumd57fd912000-03-10 22:53:23 +00009218 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009219 onError:
9220 Py_DECREF(sub_obj);
9221 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009222 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009223 PyMem_Free(buf2);
9224 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225}
9226
Alexander Belopolsky40018472011-02-26 01:02:56 +00009227Py_ssize_t
9228PyUnicode_Find(PyObject *str,
9229 PyObject *sub,
9230 Py_ssize_t start,
9231 Py_ssize_t end,
9232 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009233{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009234 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009235
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009237 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009238 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009239 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009240 if (!sub) {
9241 Py_DECREF(str);
9242 return -2;
9243 }
9244 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9245 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009246 Py_DECREF(str);
9247 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009248 }
Tim Petersced69f82003-09-16 20:30:58 +00009249
Victor Stinner794d5672011-10-10 03:21:36 +02009250 result = any_find_slice(direction,
9251 str, sub, start, end
9252 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009253
Guido van Rossumd57fd912000-03-10 22:53:23 +00009254 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009255 Py_DECREF(sub);
9256
Guido van Rossumd57fd912000-03-10 22:53:23 +00009257 return result;
9258}
9259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009260Py_ssize_t
9261PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9262 Py_ssize_t start, Py_ssize_t end,
9263 int direction)
9264{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009266 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009267 if (PyUnicode_READY(str) == -1)
9268 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009269 if (start < 0 || end < 0) {
9270 PyErr_SetString(PyExc_IndexError, "string index out of range");
9271 return -2;
9272 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 if (end > PyUnicode_GET_LENGTH(str))
9274 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009275 if (start >= end)
9276 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009278 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9279 kind, end-start, ch, direction);
9280 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009282 else
9283 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009284}
9285
Alexander Belopolsky40018472011-02-26 01:02:56 +00009286static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009287tailmatch(PyObject *self,
9288 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009289 Py_ssize_t start,
9290 Py_ssize_t end,
9291 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009292{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009293 int kind_self;
9294 int kind_sub;
9295 void *data_self;
9296 void *data_sub;
9297 Py_ssize_t offset;
9298 Py_ssize_t i;
9299 Py_ssize_t end_sub;
9300
9301 if (PyUnicode_READY(self) == -1 ||
9302 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009303 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9306 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009307 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009308 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009309
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009310 if (PyUnicode_GET_LENGTH(substring) == 0)
9311 return 1;
9312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 kind_self = PyUnicode_KIND(self);
9314 data_self = PyUnicode_DATA(self);
9315 kind_sub = PyUnicode_KIND(substring);
9316 data_sub = PyUnicode_DATA(substring);
9317 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9318
9319 if (direction > 0)
9320 offset = end;
9321 else
9322 offset = start;
9323
9324 if (PyUnicode_READ(kind_self, data_self, offset) ==
9325 PyUnicode_READ(kind_sub, data_sub, 0) &&
9326 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9327 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9328 /* If both are of the same kind, memcmp is sufficient */
9329 if (kind_self == kind_sub) {
9330 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009331 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009332 data_sub,
9333 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009334 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009335 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009336 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009337 else {
9338 /* We do not need to compare 0 and len(substring)-1 because
9339 the if statement above ensured already that they are equal
9340 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009341 for (i = 1; i < end_sub; ++i) {
9342 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9343 PyUnicode_READ(kind_sub, data_sub, i))
9344 return 0;
9345 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009346 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009348 }
9349
9350 return 0;
9351}
9352
Alexander Belopolsky40018472011-02-26 01:02:56 +00009353Py_ssize_t
9354PyUnicode_Tailmatch(PyObject *str,
9355 PyObject *substr,
9356 Py_ssize_t start,
9357 Py_ssize_t end,
9358 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009360 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009361
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362 str = PyUnicode_FromObject(str);
9363 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009364 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365 substr = PyUnicode_FromObject(substr);
9366 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009367 Py_DECREF(str);
9368 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369 }
Tim Petersced69f82003-09-16 20:30:58 +00009370
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009371 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009372 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009373 Py_DECREF(str);
9374 Py_DECREF(substr);
9375 return result;
9376}
9377
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378/* Apply fixfct filter to the Unicode object self and return a
9379 reference to the modified object */
9380
Alexander Belopolsky40018472011-02-26 01:02:56 +00009381static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009382fixup(PyObject *self,
9383 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009385 PyObject *u;
9386 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009387 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009389 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009390 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009391 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009392 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009393
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394 /* fix functions return the new maximum character in a string,
9395 if the kind of the resulting unicode object does not change,
9396 everything is fine. Otherwise we need to change the string kind
9397 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009398 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009399
9400 if (maxchar_new == 0) {
9401 /* no changes */;
9402 if (PyUnicode_CheckExact(self)) {
9403 Py_DECREF(u);
9404 Py_INCREF(self);
9405 return self;
9406 }
9407 else
9408 return u;
9409 }
9410
Victor Stinnere6abb482012-05-02 01:15:40 +02009411 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009412
Victor Stinnereaab6042011-12-11 22:22:39 +01009413 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009415
9416 /* In case the maximum character changed, we need to
9417 convert the string to the new category. */
9418 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9419 if (v == NULL) {
9420 Py_DECREF(u);
9421 return NULL;
9422 }
9423 if (maxchar_new > maxchar_old) {
9424 /* If the maxchar increased so that the kind changed, not all
9425 characters are representable anymore and we need to fix the
9426 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009427 _PyUnicode_FastCopyCharacters(v, 0,
9428 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009429 maxchar_old = fixfct(v);
9430 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009431 }
9432 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009433 _PyUnicode_FastCopyCharacters(v, 0,
9434 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009435 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009436 Py_DECREF(u);
9437 assert(_PyUnicode_CheckConsistency(v, 1));
9438 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009439}
9440
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009441static PyObject *
9442ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009443{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009444 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9445 char *resdata, *data = PyUnicode_DATA(self);
9446 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009447
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009448 res = PyUnicode_New(len, 127);
9449 if (res == NULL)
9450 return NULL;
9451 resdata = PyUnicode_DATA(res);
9452 if (lower)
9453 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009455 _Py_bytes_upper(resdata, data, len);
9456 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457}
9458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009460handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009461{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009462 Py_ssize_t j;
9463 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009464 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009465 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009466
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009467 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9468
9469 where ! is a negation and \p{xxx} is a character with property xxx.
9470 */
9471 for (j = i - 1; j >= 0; j--) {
9472 c = PyUnicode_READ(kind, data, j);
9473 if (!_PyUnicode_IsCaseIgnorable(c))
9474 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009476 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9477 if (final_sigma) {
9478 for (j = i + 1; j < length; j++) {
9479 c = PyUnicode_READ(kind, data, j);
9480 if (!_PyUnicode_IsCaseIgnorable(c))
9481 break;
9482 }
9483 final_sigma = j == length || !_PyUnicode_IsCased(c);
9484 }
9485 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009486}
9487
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009488static int
9489lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9490 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009492 /* Obscure special case. */
9493 if (c == 0x3A3) {
9494 mapped[0] = handle_capital_sigma(kind, data, length, i);
9495 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009497 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009498}
9499
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009500static Py_ssize_t
9501do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009503 Py_ssize_t i, k = 0;
9504 int n_res, j;
9505 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009506
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009507 c = PyUnicode_READ(kind, data, 0);
9508 n_res = _PyUnicode_ToUpperFull(c, mapped);
9509 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009510 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009511 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009513 for (i = 1; i < length; i++) {
9514 c = PyUnicode_READ(kind, data, i);
9515 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9516 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009517 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009518 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009519 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009520 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009521 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522}
9523
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009524static Py_ssize_t
9525do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9526 Py_ssize_t i, k = 0;
9527
9528 for (i = 0; i < length; i++) {
9529 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9530 int n_res, j;
9531 if (Py_UNICODE_ISUPPER(c)) {
9532 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9533 }
9534 else if (Py_UNICODE_ISLOWER(c)) {
9535 n_res = _PyUnicode_ToUpperFull(c, mapped);
9536 }
9537 else {
9538 n_res = 1;
9539 mapped[0] = c;
9540 }
9541 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009542 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009543 res[k++] = mapped[j];
9544 }
9545 }
9546 return k;
9547}
9548
9549static Py_ssize_t
9550do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9551 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009552{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009553 Py_ssize_t i, k = 0;
9554
9555 for (i = 0; i < length; i++) {
9556 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9557 int n_res, j;
9558 if (lower)
9559 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9560 else
9561 n_res = _PyUnicode_ToUpperFull(c, mapped);
9562 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009563 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009564 res[k++] = mapped[j];
9565 }
9566 }
9567 return k;
9568}
9569
9570static Py_ssize_t
9571do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9572{
9573 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9574}
9575
9576static Py_ssize_t
9577do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9578{
9579 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9580}
9581
Benjamin Petersone51757f2012-01-12 21:10:29 -05009582static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009583do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9584{
9585 Py_ssize_t i, k = 0;
9586
9587 for (i = 0; i < length; i++) {
9588 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9589 Py_UCS4 mapped[3];
9590 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9591 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009592 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009593 res[k++] = mapped[j];
9594 }
9595 }
9596 return k;
9597}
9598
9599static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009600do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9601{
9602 Py_ssize_t i, k = 0;
9603 int previous_is_cased;
9604
9605 previous_is_cased = 0;
9606 for (i = 0; i < length; i++) {
9607 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9608 Py_UCS4 mapped[3];
9609 int n_res, j;
9610
9611 if (previous_is_cased)
9612 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9613 else
9614 n_res = _PyUnicode_ToTitleFull(c, mapped);
9615
9616 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009617 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009618 res[k++] = mapped[j];
9619 }
9620
9621 previous_is_cased = _PyUnicode_IsCased(c);
9622 }
9623 return k;
9624}
9625
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009626static PyObject *
9627case_operation(PyObject *self,
9628 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9629{
9630 PyObject *res = NULL;
9631 Py_ssize_t length, newlength = 0;
9632 int kind, outkind;
9633 void *data, *outdata;
9634 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9635
Benjamin Petersoneea48462012-01-16 14:28:50 -05009636 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009637
9638 kind = PyUnicode_KIND(self);
9639 data = PyUnicode_DATA(self);
9640 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009641 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009642 PyErr_SetString(PyExc_OverflowError, "string is too long");
9643 return NULL;
9644 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009645 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009646 if (tmp == NULL)
9647 return PyErr_NoMemory();
9648 newlength = perform(kind, data, length, tmp, &maxchar);
9649 res = PyUnicode_New(newlength, maxchar);
9650 if (res == NULL)
9651 goto leave;
9652 tmpend = tmp + newlength;
9653 outdata = PyUnicode_DATA(res);
9654 outkind = PyUnicode_KIND(res);
9655 switch (outkind) {
9656 case PyUnicode_1BYTE_KIND:
9657 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9658 break;
9659 case PyUnicode_2BYTE_KIND:
9660 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9661 break;
9662 case PyUnicode_4BYTE_KIND:
9663 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9664 break;
9665 default:
9666 assert(0);
9667 break;
9668 }
9669 leave:
9670 PyMem_FREE(tmp);
9671 return res;
9672}
9673
Tim Peters8ce9f162004-08-27 01:49:32 +00009674PyObject *
9675PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009676{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009678 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009679 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009680 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009681 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9682 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009683 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009684 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009685 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009686 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009687 int use_memcpy;
9688 unsigned char *res_data = NULL, *sep_data = NULL;
9689 PyObject *last_obj;
9690 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009692 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009693 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009694 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009695 }
9696
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009697 /* NOTE: the following code can't call back into Python code,
9698 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009699 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009700
Tim Peters05eba1f2004-08-27 21:32:02 +00009701 seqlen = PySequence_Fast_GET_SIZE(fseq);
9702 /* If empty sequence, return u"". */
9703 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009704 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009705 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009706 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009707
Tim Peters05eba1f2004-08-27 21:32:02 +00009708 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009709 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009710 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009711 if (seqlen == 1) {
9712 if (PyUnicode_CheckExact(items[0])) {
9713 res = items[0];
9714 Py_INCREF(res);
9715 Py_DECREF(fseq);
9716 return res;
9717 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009718 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009719 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009720 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009721 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009722 /* Set up sep and seplen */
9723 if (separator == NULL) {
9724 /* fall back to a blank space separator */
9725 sep = PyUnicode_FromOrdinal(' ');
9726 if (!sep)
9727 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009728 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009729 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009730 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009731 else {
9732 if (!PyUnicode_Check(separator)) {
9733 PyErr_Format(PyExc_TypeError,
9734 "separator: expected str instance,"
9735 " %.80s found",
9736 Py_TYPE(separator)->tp_name);
9737 goto onError;
9738 }
9739 if (PyUnicode_READY(separator))
9740 goto onError;
9741 sep = separator;
9742 seplen = PyUnicode_GET_LENGTH(separator);
9743 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9744 /* inc refcount to keep this code path symmetric with the
9745 above case of a blank separator */
9746 Py_INCREF(sep);
9747 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009748 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009749 }
9750
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009751 /* There are at least two things to join, or else we have a subclass
9752 * of str in the sequence.
9753 * Do a pre-pass to figure out the total amount of space we'll
9754 * need (sz), and see whether all argument are strings.
9755 */
9756 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009757#ifdef Py_DEBUG
9758 use_memcpy = 0;
9759#else
9760 use_memcpy = 1;
9761#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009762 for (i = 0; i < seqlen; i++) {
9763 const Py_ssize_t old_sz = sz;
9764 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009765 if (!PyUnicode_Check(item)) {
9766 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009767 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009768 " %.80s found",
9769 i, Py_TYPE(item)->tp_name);
9770 goto onError;
9771 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009772 if (PyUnicode_READY(item) == -1)
9773 goto onError;
9774 sz += PyUnicode_GET_LENGTH(item);
9775 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009776 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009777 if (i != 0)
9778 sz += seplen;
9779 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9780 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009781 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009782 goto onError;
9783 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009784 if (use_memcpy && last_obj != NULL) {
9785 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9786 use_memcpy = 0;
9787 }
9788 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009789 }
Tim Petersced69f82003-09-16 20:30:58 +00009790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009791 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009792 if (res == NULL)
9793 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009794
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009795 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009796#ifdef Py_DEBUG
9797 use_memcpy = 0;
9798#else
9799 if (use_memcpy) {
9800 res_data = PyUnicode_1BYTE_DATA(res);
9801 kind = PyUnicode_KIND(res);
9802 if (seplen != 0)
9803 sep_data = PyUnicode_1BYTE_DATA(sep);
9804 }
9805#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009806 if (use_memcpy) {
9807 for (i = 0; i < seqlen; ++i) {
9808 Py_ssize_t itemlen;
9809 item = items[i];
9810
9811 /* Copy item, and maybe the separator. */
9812 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009813 Py_MEMCPY(res_data,
9814 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009815 kind * seplen);
9816 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009817 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009818
9819 itemlen = PyUnicode_GET_LENGTH(item);
9820 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009821 Py_MEMCPY(res_data,
9822 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009823 kind * itemlen);
9824 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009825 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009826 }
9827 assert(res_data == PyUnicode_1BYTE_DATA(res)
9828 + kind * PyUnicode_GET_LENGTH(res));
9829 }
9830 else {
9831 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9832 Py_ssize_t itemlen;
9833 item = items[i];
9834
9835 /* Copy item, and maybe the separator. */
9836 if (i && seplen != 0) {
9837 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9838 res_offset += seplen;
9839 }
9840
9841 itemlen = PyUnicode_GET_LENGTH(item);
9842 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009843 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009844 res_offset += itemlen;
9845 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009846 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009847 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009848 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009849
Tim Peters05eba1f2004-08-27 21:32:02 +00009850 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009852 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009853 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009854
Benjamin Peterson29060642009-01-31 22:14:21 +00009855 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009856 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009858 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859 return NULL;
9860}
9861
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive code units of the character
   buffer `data`, beginning at index `start`.  `kind` selects the width of
   one code unit (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
   PyUnicode_4BYTE_KIND); PyUnicode_WCHAR_KIND is rejected by an assertion.

   No bounds checking is done here: the caller must make sure that
   [start, start + length) lies inside the buffer and that `value` fits
   into one code unit of the given kind.

   Every macro argument is parenthesized at each use so that arbitrary
   expressions can be passed safely.  Arguments may still be evaluated more
   than once, so they must not have side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9885
Victor Stinnerd3f08822012-05-29 12:57:52 +02009886void
9887_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9888 Py_UCS4 fill_char)
9889{
9890 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9891 const void *data = PyUnicode_DATA(unicode);
9892 assert(PyUnicode_IS_READY(unicode));
9893 assert(unicode_modifiable(unicode));
9894 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9895 assert(start >= 0);
9896 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9897 FILL(kind, data, fill_char, start, length);
9898}
9899
Victor Stinner3fe55312012-01-04 00:33:50 +01009900Py_ssize_t
9901PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9902 Py_UCS4 fill_char)
9903{
9904 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009905
9906 if (!PyUnicode_Check(unicode)) {
9907 PyErr_BadInternalCall();
9908 return -1;
9909 }
9910 if (PyUnicode_READY(unicode) == -1)
9911 return -1;
9912 if (unicode_check_modifiable(unicode))
9913 return -1;
9914
Victor Stinnerd3f08822012-05-29 12:57:52 +02009915 if (start < 0) {
9916 PyErr_SetString(PyExc_IndexError, "string index out of range");
9917 return -1;
9918 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009919 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9920 PyErr_SetString(PyExc_ValueError,
9921 "fill character is bigger than "
9922 "the string maximum character");
9923 return -1;
9924 }
9925
9926 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9927 length = Py_MIN(maxlen, length);
9928 if (length <= 0)
9929 return 0;
9930
Victor Stinnerd3f08822012-05-29 12:57:52 +02009931 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009932 return length;
9933}
9934
Victor Stinner9310abb2011-10-05 00:59:23 +02009935static PyObject *
9936pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009937 Py_ssize_t left,
9938 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009940{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 PyObject *u;
9942 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009943 int kind;
9944 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009945
9946 if (left < 0)
9947 left = 0;
9948 if (right < 0)
9949 right = 0;
9950
Victor Stinnerc4b49542011-12-11 22:44:26 +01009951 if (left == 0 && right == 0)
9952 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009954 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9955 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009956 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9957 return NULL;
9958 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009960 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009961 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009962 if (!u)
9963 return NULL;
9964
9965 kind = PyUnicode_KIND(u);
9966 data = PyUnicode_DATA(u);
9967 if (left)
9968 FILL(kind, data, fill, 0, left);
9969 if (right)
9970 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009971 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009972 assert(_PyUnicode_CheckConsistency(u, 1));
9973 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009974}
9975
Alexander Belopolsky40018472011-02-26 01:02:56 +00009976PyObject *
9977PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009978{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009979 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009980
9981 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009982 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009983 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009984 if (PyUnicode_READY(string) == -1) {
9985 Py_DECREF(string);
9986 return NULL;
9987 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009988
Benjamin Petersonead6b532011-12-20 17:23:42 -06009989 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009990 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009991 if (PyUnicode_IS_ASCII(string))
9992 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009993 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009994 PyUnicode_GET_LENGTH(string), keepends);
9995 else
9996 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009997 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009998 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009999 break;
10000 case PyUnicode_2BYTE_KIND:
10001 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010002 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010003 PyUnicode_GET_LENGTH(string), keepends);
10004 break;
10005 case PyUnicode_4BYTE_KIND:
10006 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010007 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010008 PyUnicode_GET_LENGTH(string), keepends);
10009 break;
10010 default:
10011 assert(0);
10012 list = 0;
10013 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010014 Py_DECREF(string);
10015 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010016}
10017
Alexander Belopolsky40018472011-02-26 01:02:56 +000010018static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010019split(PyObject *self,
10020 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010021 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010022{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010023 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 void *buf1, *buf2;
10025 Py_ssize_t len1, len2;
10026 PyObject* out;
10027
Guido van Rossumd57fd912000-03-10 22:53:23 +000010028 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010029 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 if (PyUnicode_READY(self) == -1)
10032 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010034 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010035 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010037 if (PyUnicode_IS_ASCII(self))
10038 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010039 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010040 PyUnicode_GET_LENGTH(self), maxcount
10041 );
10042 else
10043 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010044 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010045 PyUnicode_GET_LENGTH(self), maxcount
10046 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010047 case PyUnicode_2BYTE_KIND:
10048 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010049 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050 PyUnicode_GET_LENGTH(self), maxcount
10051 );
10052 case PyUnicode_4BYTE_KIND:
10053 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010054 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010055 PyUnicode_GET_LENGTH(self), maxcount
10056 );
10057 default:
10058 assert(0);
10059 return NULL;
10060 }
10061
10062 if (PyUnicode_READY(substring) == -1)
10063 return NULL;
10064
10065 kind1 = PyUnicode_KIND(self);
10066 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067 len1 = PyUnicode_GET_LENGTH(self);
10068 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010069 if (kind1 < kind2 || len1 < len2) {
10070 out = PyList_New(1);
10071 if (out == NULL)
10072 return NULL;
10073 Py_INCREF(self);
10074 PyList_SET_ITEM(out, 0, self);
10075 return out;
10076 }
10077 buf1 = PyUnicode_DATA(self);
10078 buf2 = PyUnicode_DATA(substring);
10079 if (kind2 != kind1) {
10080 buf2 = _PyUnicode_AsKind(substring, kind1);
10081 if (!buf2)
10082 return NULL;
10083 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010085 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010087 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10088 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010089 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010090 else
10091 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010092 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 break;
10094 case PyUnicode_2BYTE_KIND:
10095 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010096 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 break;
10098 case PyUnicode_4BYTE_KIND:
10099 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010100 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 break;
10102 default:
10103 out = NULL;
10104 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010105 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 PyMem_Free(buf2);
10107 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010108}
10109
Alexander Belopolsky40018472011-02-26 01:02:56 +000010110static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010111rsplit(PyObject *self,
10112 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010113 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010114{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010115 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 void *buf1, *buf2;
10117 Py_ssize_t len1, len2;
10118 PyObject* out;
10119
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010120 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010121 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010123 if (PyUnicode_READY(self) == -1)
10124 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010126 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010127 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010129 if (PyUnicode_IS_ASCII(self))
10130 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010131 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010132 PyUnicode_GET_LENGTH(self), maxcount
10133 );
10134 else
10135 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010136 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010137 PyUnicode_GET_LENGTH(self), maxcount
10138 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010139 case PyUnicode_2BYTE_KIND:
10140 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010141 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 PyUnicode_GET_LENGTH(self), maxcount
10143 );
10144 case PyUnicode_4BYTE_KIND:
10145 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010146 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 PyUnicode_GET_LENGTH(self), maxcount
10148 );
10149 default:
10150 assert(0);
10151 return NULL;
10152 }
10153
10154 if (PyUnicode_READY(substring) == -1)
10155 return NULL;
10156
10157 kind1 = PyUnicode_KIND(self);
10158 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010159 len1 = PyUnicode_GET_LENGTH(self);
10160 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010161 if (kind1 < kind2 || len1 < len2) {
10162 out = PyList_New(1);
10163 if (out == NULL)
10164 return NULL;
10165 Py_INCREF(self);
10166 PyList_SET_ITEM(out, 0, self);
10167 return out;
10168 }
10169 buf1 = PyUnicode_DATA(self);
10170 buf2 = PyUnicode_DATA(substring);
10171 if (kind2 != kind1) {
10172 buf2 = _PyUnicode_AsKind(substring, kind1);
10173 if (!buf2)
10174 return NULL;
10175 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010177 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010179 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10180 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010181 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010182 else
10183 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010184 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 break;
10186 case PyUnicode_2BYTE_KIND:
10187 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010188 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 break;
10190 case PyUnicode_4BYTE_KIND:
10191 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010192 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 break;
10194 default:
10195 out = NULL;
10196 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010197 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 PyMem_Free(buf2);
10199 return out;
10200}
10201
10202static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010203anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10204 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010205{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010206 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010208 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10209 return asciilib_find(buf1, len1, buf2, len2, offset);
10210 else
10211 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 case PyUnicode_2BYTE_KIND:
10213 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10214 case PyUnicode_4BYTE_KIND:
10215 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10216 }
10217 assert(0);
10218 return -1;
10219}
10220
10221static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010222anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10223 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010225 switch (kind) {
10226 case PyUnicode_1BYTE_KIND:
10227 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10228 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10229 else
10230 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10231 case PyUnicode_2BYTE_KIND:
10232 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10233 case PyUnicode_4BYTE_KIND:
10234 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10235 }
10236 assert(0);
10237 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010238}
10239
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010240static void
10241replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10242 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10243{
10244 int kind = PyUnicode_KIND(u);
10245 void *data = PyUnicode_DATA(u);
10246 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10247 if (kind == PyUnicode_1BYTE_KIND) {
10248 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10249 (Py_UCS1 *)data + len,
10250 u1, u2, maxcount);
10251 }
10252 else if (kind == PyUnicode_2BYTE_KIND) {
10253 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10254 (Py_UCS2 *)data + len,
10255 u1, u2, maxcount);
10256 }
10257 else {
10258 assert(kind == PyUnicode_4BYTE_KIND);
10259 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10260 (Py_UCS4 *)data + len,
10261 u1, u2, maxcount);
10262 }
10263}
10264
Alexander Belopolsky40018472011-02-26 01:02:56 +000010265static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266replace(PyObject *self, PyObject *str1,
10267 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010268{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 PyObject *u;
10270 char *sbuf = PyUnicode_DATA(self);
10271 char *buf1 = PyUnicode_DATA(str1);
10272 char *buf2 = PyUnicode_DATA(str2);
10273 int srelease = 0, release1 = 0, release2 = 0;
10274 int skind = PyUnicode_KIND(self);
10275 int kind1 = PyUnicode_KIND(str1);
10276 int kind2 = PyUnicode_KIND(str2);
10277 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10278 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10279 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010280 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010281 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010282
10283 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010284 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010286 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010287
Victor Stinner59de0ee2011-10-07 10:01:28 +020010288 if (str1 == str2)
10289 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290
Victor Stinner49a0a212011-10-12 23:46:10 +020010291 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010292 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10293 if (maxchar < maxchar_str1)
10294 /* substring too wide to be present */
10295 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010296 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10297 /* Replacing str1 with str2 may cause a maxchar reduction in the
10298 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010299 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010300 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010303 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010305 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010307 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010308 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010309 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010310
Victor Stinner69ed0f42013-04-09 21:48:24 +020010311 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010312 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010313 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010314 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010315 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010317 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010319
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010320 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10321 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010322 }
10323 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 int rkind = skind;
10325 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010326 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010327
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 if (kind1 < rkind) {
10329 /* widen substring */
10330 buf1 = _PyUnicode_AsKind(str1, rkind);
10331 if (!buf1) goto error;
10332 release1 = 1;
10333 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010334 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010335 if (i < 0)
10336 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 if (rkind > kind2) {
10338 /* widen replacement */
10339 buf2 = _PyUnicode_AsKind(str2, rkind);
10340 if (!buf2) goto error;
10341 release2 = 1;
10342 }
10343 else if (rkind < kind2) {
10344 /* widen self and buf1 */
10345 rkind = kind2;
10346 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010347 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 sbuf = _PyUnicode_AsKind(self, rkind);
10349 if (!sbuf) goto error;
10350 srelease = 1;
10351 buf1 = _PyUnicode_AsKind(str1, rkind);
10352 if (!buf1) goto error;
10353 release1 = 1;
10354 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010355 u = PyUnicode_New(slen, maxchar);
10356 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010358 assert(PyUnicode_KIND(u) == rkind);
10359 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010360
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010361 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010362 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010363 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010365 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010367
10368 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010369 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010370 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010371 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010372 if (i == -1)
10373 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010374 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010376 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010378 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010380 }
10381 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010383 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 int rkind = skind;
10385 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010388 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 buf1 = _PyUnicode_AsKind(str1, rkind);
10390 if (!buf1) goto error;
10391 release1 = 1;
10392 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010393 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010394 if (n == 0)
10395 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010397 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 buf2 = _PyUnicode_AsKind(str2, rkind);
10399 if (!buf2) goto error;
10400 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010401 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010403 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 rkind = kind2;
10405 sbuf = _PyUnicode_AsKind(self, rkind);
10406 if (!sbuf) goto error;
10407 srelease = 1;
10408 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010409 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 buf1 = _PyUnicode_AsKind(str1, rkind);
10411 if (!buf1) goto error;
10412 release1 = 1;
10413 }
10414 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10415 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010416 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 PyErr_SetString(PyExc_OverflowError,
10418 "replace string is too long");
10419 goto error;
10420 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010421 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010422 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010423 _Py_INCREF_UNICODE_EMPTY();
10424 if (!unicode_empty)
10425 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010426 u = unicode_empty;
10427 goto done;
10428 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010429 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 PyErr_SetString(PyExc_OverflowError,
10431 "replace string is too long");
10432 goto error;
10433 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010434 u = PyUnicode_New(new_size, maxchar);
10435 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010437 assert(PyUnicode_KIND(u) == rkind);
10438 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 ires = i = 0;
10440 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010441 while (n-- > 0) {
10442 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010443 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010444 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010445 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010446 if (j == -1)
10447 break;
10448 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010449 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010450 memcpy(res + rkind * ires,
10451 sbuf + rkind * i,
10452 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010454 }
10455 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010457 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010459 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010461 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010465 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010466 memcpy(res + rkind * ires,
10467 sbuf + rkind * i,
10468 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010469 }
10470 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010471 /* interleave */
10472 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010473 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010475 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010477 if (--n <= 0)
10478 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010479 memcpy(res + rkind * ires,
10480 sbuf + rkind * i,
10481 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 ires++;
10483 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010484 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010485 memcpy(res + rkind * ires,
10486 sbuf + rkind * i,
10487 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010488 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010489 }
10490
10491 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010492 unicode_adjust_maxchar(&u);
10493 if (u == NULL)
10494 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010495 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010496
10497 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 if (srelease)
10499 PyMem_FREE(sbuf);
10500 if (release1)
10501 PyMem_FREE(buf1);
10502 if (release2)
10503 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010504 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010506
Benjamin Peterson29060642009-01-31 22:14:21 +000010507 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010508 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 if (srelease)
10510 PyMem_FREE(sbuf);
10511 if (release1)
10512 PyMem_FREE(buf1);
10513 if (release2)
10514 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010515 return unicode_result_unchanged(self);
10516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 error:
10518 if (srelease && sbuf)
10519 PyMem_FREE(sbuf);
10520 if (release1 && buf1)
10521 PyMem_FREE(buf1);
10522 if (release2 && buf2)
10523 PyMem_FREE(buf2);
10524 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525}
10526
10527/* --- Unicode Object Methods --------------------------------------------- */
10528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010529PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010530 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531\n\
10532Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010533characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010534
10535static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010536unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010537{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010538 if (PyUnicode_READY(self) == -1)
10539 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010540 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010541}
10542
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010543PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010544 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010545\n\
10546Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010547have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010548
10549static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010550unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010551{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010552 if (PyUnicode_READY(self) == -1)
10553 return NULL;
10554 if (PyUnicode_GET_LENGTH(self) == 0)
10555 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010556 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010557}
10558
Benjamin Petersond5890c82012-01-14 13:23:30 -050010559PyDoc_STRVAR(casefold__doc__,
10560 "S.casefold() -> str\n\
10561\n\
10562Return a version of S suitable for caseless comparisons.");
10563
10564static PyObject *
10565unicode_casefold(PyObject *self)
10566{
10567 if (PyUnicode_READY(self) == -1)
10568 return NULL;
10569 if (PyUnicode_IS_ASCII(self))
10570 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010571 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010572}
10573
10574
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010575/* Argument converter. Coerces to a single unicode character */
10576
10577static int
10578convert_uc(PyObject *obj, void *addr)
10579{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010581 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010582
Benjamin Peterson14339b62009-01-31 16:36:08 +000010583 uniobj = PyUnicode_FromObject(obj);
10584 if (uniobj == NULL) {
10585 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010586 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010587 return 0;
10588 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010590 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010591 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010592 Py_DECREF(uniobj);
10593 return 0;
10594 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010596 Py_DECREF(uniobj);
10597 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010598}
10599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010600PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010601 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010602\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010603Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010604done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010605
10606static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010607unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010608{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010609 Py_ssize_t marg, left;
10610 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 Py_UCS4 fillchar = ' ';
10612
Victor Stinnere9a29352011-10-01 02:14:59 +020010613 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615
Benjamin Petersonbac79492012-01-14 13:34:47 -050010616 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617 return NULL;
10618
Victor Stinnerc4b49542011-12-11 22:44:26 +010010619 if (PyUnicode_GET_LENGTH(self) >= width)
10620 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621
Victor Stinnerc4b49542011-12-11 22:44:26 +010010622 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010623 left = marg / 2 + (marg & width & 1);
10624
Victor Stinner9310abb2011-10-05 00:59:23 +020010625 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010626}
10627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628/* This function assumes that str1 and str2 are readied by the caller. */
10629
Marc-André Lemburge5034372000-08-08 08:04:29 +000010630static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010631unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010632{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010633#define COMPARE(TYPE1, TYPE2) \
10634 do { \
10635 TYPE1* p1 = (TYPE1 *)data1; \
10636 TYPE2* p2 = (TYPE2 *)data2; \
10637 TYPE1* end = p1 + len; \
10638 Py_UCS4 c1, c2; \
10639 for (; p1 != end; p1++, p2++) { \
10640 c1 = *p1; \
10641 c2 = *p2; \
10642 if (c1 != c2) \
10643 return (c1 < c2) ? -1 : 1; \
10644 } \
10645 } \
10646 while (0)
10647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 int kind1, kind2;
10649 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010650 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 kind1 = PyUnicode_KIND(str1);
10653 kind2 = PyUnicode_KIND(str2);
10654 data1 = PyUnicode_DATA(str1);
10655 data2 = PyUnicode_DATA(str2);
10656 len1 = PyUnicode_GET_LENGTH(str1);
10657 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010658 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010659
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010660 switch(kind1) {
10661 case PyUnicode_1BYTE_KIND:
10662 {
10663 switch(kind2) {
10664 case PyUnicode_1BYTE_KIND:
10665 {
10666 int cmp = memcmp(data1, data2, len);
10667 /* normalize result of memcmp() into the range [-1; 1] */
10668 if (cmp < 0)
10669 return -1;
10670 if (cmp > 0)
10671 return 1;
10672 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010673 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010674 case PyUnicode_2BYTE_KIND:
10675 COMPARE(Py_UCS1, Py_UCS2);
10676 break;
10677 case PyUnicode_4BYTE_KIND:
10678 COMPARE(Py_UCS1, Py_UCS4);
10679 break;
10680 default:
10681 assert(0);
10682 }
10683 break;
10684 }
10685 case PyUnicode_2BYTE_KIND:
10686 {
10687 switch(kind2) {
10688 case PyUnicode_1BYTE_KIND:
10689 COMPARE(Py_UCS2, Py_UCS1);
10690 break;
10691 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010692 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010693 COMPARE(Py_UCS2, Py_UCS2);
10694 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010695 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010696 case PyUnicode_4BYTE_KIND:
10697 COMPARE(Py_UCS2, Py_UCS4);
10698 break;
10699 default:
10700 assert(0);
10701 }
10702 break;
10703 }
10704 case PyUnicode_4BYTE_KIND:
10705 {
10706 switch(kind2) {
10707 case PyUnicode_1BYTE_KIND:
10708 COMPARE(Py_UCS4, Py_UCS1);
10709 break;
10710 case PyUnicode_2BYTE_KIND:
10711 COMPARE(Py_UCS4, Py_UCS2);
10712 break;
10713 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010714 {
10715#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10716 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10717 /* normalize result of wmemcmp() into the range [-1; 1] */
10718 if (cmp < 0)
10719 return -1;
10720 if (cmp > 0)
10721 return 1;
10722#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010723 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010724#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010725 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010726 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010727 default:
10728 assert(0);
10729 }
10730 break;
10731 }
10732 default:
10733 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010734 }
10735
Victor Stinner770e19e2012-10-04 22:59:45 +020010736 if (len1 == len2)
10737 return 0;
10738 if (len1 < len2)
10739 return -1;
10740 else
10741 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010742
10743#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010744}
10745
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010746Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010747unicode_compare_eq(PyObject *str1, PyObject *str2)
10748{
10749 int kind;
10750 void *data1, *data2;
10751 Py_ssize_t len;
10752 int cmp;
10753
Victor Stinnere5567ad2012-10-23 02:48:49 +020010754 len = PyUnicode_GET_LENGTH(str1);
10755 if (PyUnicode_GET_LENGTH(str2) != len)
10756 return 0;
10757 kind = PyUnicode_KIND(str1);
10758 if (PyUnicode_KIND(str2) != kind)
10759 return 0;
10760 data1 = PyUnicode_DATA(str1);
10761 data2 = PyUnicode_DATA(str2);
10762
10763 cmp = memcmp(data1, data2, len * kind);
10764 return (cmp == 0);
10765}
10766
10767
Alexander Belopolsky40018472011-02-26 01:02:56 +000010768int
10769PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10772 if (PyUnicode_READY(left) == -1 ||
10773 PyUnicode_READY(right) == -1)
10774 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010775
10776 /* a string is equal to itself */
10777 if (left == right)
10778 return 0;
10779
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010780 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010781 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010782 PyErr_Format(PyExc_TypeError,
10783 "Can't compare %.100s and %.100s",
10784 left->ob_type->tp_name,
10785 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786 return -1;
10787}
10788
Martin v. Löwis5b222132007-06-10 09:51:05 +000010789int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010790_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10791{
10792 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10793 if (right_str == NULL)
10794 return -1;
10795 return PyUnicode_Compare(left, right_str);
10796}
10797
10798int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010799PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10800{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 Py_ssize_t i;
10802 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 Py_UCS4 chr;
10804
Victor Stinner910337b2011-10-03 03:20:16 +020010805 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806 if (PyUnicode_READY(uni) == -1)
10807 return -1;
10808 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010809 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010810 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010811 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010812 size_t len, len2 = strlen(str);
10813 int cmp;
10814
10815 len = Py_MIN(len1, len2);
10816 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010817 if (cmp != 0) {
10818 if (cmp < 0)
10819 return -1;
10820 else
10821 return 1;
10822 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010823 if (len1 > len2)
10824 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010825 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010826 return -1; /* str is longer */
10827 return 0;
10828 }
10829 else {
10830 void *data = PyUnicode_DATA(uni);
10831 /* Compare Unicode string and source character set string */
10832 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010833 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010834 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10835 /* This check keeps Python strings that end in '\0' from comparing equal
10836 to C strings identical up to that point. */
10837 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10838 return 1; /* uni is longer */
10839 if (str[i])
10840 return -1; /* str is longer */
10841 return 0;
10842 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010843}
10844
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010845
Benjamin Peterson29060642009-01-31 22:14:21 +000010846#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010847 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010848
Alexander Belopolsky40018472011-02-26 01:02:56 +000010849PyObject *
10850PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010851{
10852 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010853 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010854
Victor Stinnere5567ad2012-10-23 02:48:49 +020010855 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10856 Py_RETURN_NOTIMPLEMENTED;
10857
10858 if (PyUnicode_READY(left) == -1 ||
10859 PyUnicode_READY(right) == -1)
10860 return NULL;
10861
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010862 if (left == right) {
10863 switch (op) {
10864 case Py_EQ:
10865 case Py_LE:
10866 case Py_GE:
10867 /* a string is equal to itself */
10868 v = Py_True;
10869 break;
10870 case Py_NE:
10871 case Py_LT:
10872 case Py_GT:
10873 v = Py_False;
10874 break;
10875 default:
10876 PyErr_BadArgument();
10877 return NULL;
10878 }
10879 }
10880 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010881 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010882 result ^= (op == Py_NE);
10883 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010884 }
10885 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010886 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010887
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010888 /* Convert the return value to a Boolean */
10889 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010890 case Py_LE:
10891 v = TEST_COND(result <= 0);
10892 break;
10893 case Py_GE:
10894 v = TEST_COND(result >= 0);
10895 break;
10896 case Py_LT:
10897 v = TEST_COND(result == -1);
10898 break;
10899 case Py_GT:
10900 v = TEST_COND(result == 1);
10901 break;
10902 default:
10903 PyErr_BadArgument();
10904 return NULL;
10905 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010906 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010907 Py_INCREF(v);
10908 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010909}
10910
Alexander Belopolsky40018472011-02-26 01:02:56 +000010911int
10912PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010913{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010914 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010915 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916 void *buf1, *buf2;
10917 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010918 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010919
10920 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010921 sub = PyUnicode_FromObject(element);
10922 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010923 PyErr_Format(PyExc_TypeError,
10924 "'in <string>' requires string as left operand, not %s",
10925 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010926 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010927 }
10928
Thomas Wouters477c8d52006-05-27 19:21:47 +000010929 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010930 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010931 Py_DECREF(sub);
10932 return -1;
10933 }
10934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010935 kind1 = PyUnicode_KIND(str);
10936 kind2 = PyUnicode_KIND(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010937 if (kind1 < kind2) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010939 Py_DECREF(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010940 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941 }
10942 len1 = PyUnicode_GET_LENGTH(str);
10943 len2 = PyUnicode_GET_LENGTH(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010944 if (len1 < len2) {
10945 Py_DECREF(sub);
10946 Py_DECREF(str);
10947 return 0;
10948 }
10949 buf1 = PyUnicode_DATA(str);
10950 buf2 = PyUnicode_DATA(sub);
10951 if (len2 == 1) {
10952 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
10953 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
10954 Py_DECREF(sub);
10955 Py_DECREF(str);
10956 return result;
10957 }
10958 if (kind2 != kind1) {
10959 buf2 = _PyUnicode_AsKind(sub, kind1);
10960 if (!buf2) {
10961 Py_DECREF(sub);
10962 Py_DECREF(str);
10963 return -1;
10964 }
10965 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966
Victor Stinner77282cb2013-04-14 19:22:47 +020010967 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010968 case PyUnicode_1BYTE_KIND:
10969 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10970 break;
10971 case PyUnicode_2BYTE_KIND:
10972 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10973 break;
10974 case PyUnicode_4BYTE_KIND:
10975 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10976 break;
10977 default:
10978 result = -1;
10979 assert(0);
10980 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010981
10982 Py_DECREF(str);
10983 Py_DECREF(sub);
10984
Victor Stinner77282cb2013-04-14 19:22:47 +020010985 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010986 PyMem_Free(buf2);
10987
Guido van Rossum403d68b2000-03-13 15:55:09 +000010988 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010989}
10990
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991/* Concat to string or Unicode object giving a new Unicode object. */
10992
Alexander Belopolsky40018472011-02-26 01:02:56 +000010993PyObject *
10994PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010996 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010997 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010998 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010999
11000 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011003 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011006 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007
11008 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011009 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011010 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011011 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011013 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011014 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011015 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011016 }
11017
Victor Stinner488fa492011-12-12 00:01:39 +010011018 u_len = PyUnicode_GET_LENGTH(u);
11019 v_len = PyUnicode_GET_LENGTH(v);
11020 if (u_len > PY_SSIZE_T_MAX - v_len) {
11021 PyErr_SetString(PyExc_OverflowError,
11022 "strings are too large to concat");
11023 goto onError;
11024 }
11025 new_len = u_len + v_len;
11026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011027 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011028 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011029 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011032 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011034 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011035 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11036 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037 Py_DECREF(u);
11038 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011039 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011040 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041
Benjamin Peterson29060642009-01-31 22:14:21 +000011042 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043 Py_XDECREF(u);
11044 Py_XDECREF(v);
11045 return NULL;
11046}
11047
Walter Dörwald1ab83302007-05-18 17:15:44 +000011048void
Victor Stinner23e56682011-10-03 03:54:37 +020011049PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011050{
Victor Stinner23e56682011-10-03 03:54:37 +020011051 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011052 Py_UCS4 maxchar, maxchar2;
11053 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011054
11055 if (p_left == NULL) {
11056 if (!PyErr_Occurred())
11057 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011058 return;
11059 }
Victor Stinner23e56682011-10-03 03:54:37 +020011060 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011061 if (right == NULL || left == NULL
11062 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011063 if (!PyErr_Occurred())
11064 PyErr_BadInternalCall();
11065 goto error;
11066 }
11067
Benjamin Petersonbac79492012-01-14 13:34:47 -050011068 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011069 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011070 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011071 goto error;
11072
Victor Stinner488fa492011-12-12 00:01:39 +010011073 /* Shortcuts */
11074 if (left == unicode_empty) {
11075 Py_DECREF(left);
11076 Py_INCREF(right);
11077 *p_left = right;
11078 return;
11079 }
11080 if (right == unicode_empty)
11081 return;
11082
11083 left_len = PyUnicode_GET_LENGTH(left);
11084 right_len = PyUnicode_GET_LENGTH(right);
11085 if (left_len > PY_SSIZE_T_MAX - right_len) {
11086 PyErr_SetString(PyExc_OverflowError,
11087 "strings are too large to concat");
11088 goto error;
11089 }
11090 new_len = left_len + right_len;
11091
11092 if (unicode_modifiable(left)
11093 && PyUnicode_CheckExact(right)
11094 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011095 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11096 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011097 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011098 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011099 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11100 {
11101 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011102 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011103 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011104
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011105 /* copy 'right' into the newly allocated area of 'left' */
11106 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011107 }
Victor Stinner488fa492011-12-12 00:01:39 +010011108 else {
11109 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11110 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011111 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011112
Victor Stinner488fa492011-12-12 00:01:39 +010011113 /* Concat the two Unicode strings */
11114 res = PyUnicode_New(new_len, maxchar);
11115 if (res == NULL)
11116 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011117 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11118 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011119 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011120 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011121 }
11122 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011123 return;
11124
11125error:
Victor Stinner488fa492011-12-12 00:01:39 +010011126 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011127}
11128
11129void
11130PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11131{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011132 PyUnicode_Append(pleft, right);
11133 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011134}
11135
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011136PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011137 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011138\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011139Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011140string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011141interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142
11143static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011144unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011146 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011147 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011148 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011150 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011151 void *buf1, *buf2;
11152 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153
Jesus Ceaac451502011-04-20 17:09:23 +020011154 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11155 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011156 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011158 kind1 = PyUnicode_KIND(self);
11159 kind2 = PyUnicode_KIND(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011160 if (kind1 < kind2) {
Christian Heimesd47802e2013-06-29 21:33:36 +020011161 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011162 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011163 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011164 len1 = PyUnicode_GET_LENGTH(self);
11165 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011166 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011167 if (end - start < len2) {
11168 Py_DECREF(substring);
11169 return PyLong_FromLong(0);
11170 }
11171 buf1 = PyUnicode_DATA(self);
11172 buf2 = PyUnicode_DATA(substring);
11173 if (kind2 != kind1) {
11174 buf2 = _PyUnicode_AsKind(substring, kind1);
11175 if (!buf2) {
11176 Py_DECREF(substring);
11177 return NULL;
11178 }
11179 }
11180 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011181 case PyUnicode_1BYTE_KIND:
11182 iresult = ucs1lib_count(
11183 ((Py_UCS1*)buf1) + start, end - start,
11184 buf2, len2, PY_SSIZE_T_MAX
11185 );
11186 break;
11187 case PyUnicode_2BYTE_KIND:
11188 iresult = ucs2lib_count(
11189 ((Py_UCS2*)buf1) + start, end - start,
11190 buf2, len2, PY_SSIZE_T_MAX
11191 );
11192 break;
11193 case PyUnicode_4BYTE_KIND:
11194 iresult = ucs4lib_count(
11195 ((Py_UCS4*)buf1) + start, end - start,
11196 buf2, len2, PY_SSIZE_T_MAX
11197 );
11198 break;
11199 default:
11200 assert(0); iresult = 0;
11201 }
11202
11203 result = PyLong_FromSsize_t(iresult);
11204
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011205 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011206 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207
11208 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011209
Guido van Rossumd57fd912000-03-10 22:53:23 +000011210 return result;
11211}
11212
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011213PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011214 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011216Encode S using the codec registered for encoding. Default encoding\n\
11217is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011218handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011219a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11220'xmlcharrefreplace' as well as any other name registered with\n\
11221codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222
11223static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011224unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011226 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227 char *encoding = NULL;
11228 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011229
Benjamin Peterson308d6372009-09-18 21:42:35 +000011230 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11231 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011233 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011234}
11235
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011236PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011237 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238\n\
11239Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011240If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241
11242static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011243unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011245 Py_ssize_t i, j, line_pos, src_len, incr;
11246 Py_UCS4 ch;
11247 PyObject *u;
11248 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011249 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011251 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011252 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253
Ezio Melotti745d54d2013-11-16 19:10:57 +020011254 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11255 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011256 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257
Antoine Pitrou22425222011-10-04 19:10:51 +020011258 if (PyUnicode_READY(self) == -1)
11259 return NULL;
11260
Thomas Wouters7e474022000-07-16 12:04:32 +000011261 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011262 src_len = PyUnicode_GET_LENGTH(self);
11263 i = j = line_pos = 0;
11264 kind = PyUnicode_KIND(self);
11265 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011266 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011267 for (; i < src_len; i++) {
11268 ch = PyUnicode_READ(kind, src_data, i);
11269 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011270 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011271 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011272 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011273 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011274 goto overflow;
11275 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011276 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011277 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011278 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011280 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011281 goto overflow;
11282 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011284 if (ch == '\n' || ch == '\r')
11285 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011287 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011288 if (!found)
11289 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011290
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011292 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293 if (!u)
11294 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011295 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011296
Antoine Pitroue71d5742011-10-04 15:55:09 +020011297 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298
Antoine Pitroue71d5742011-10-04 15:55:09 +020011299 for (; i < src_len; i++) {
11300 ch = PyUnicode_READ(kind, src_data, i);
11301 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011302 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011303 incr = tabsize - (line_pos % tabsize);
11304 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011305 FILL(kind, dest_data, ' ', j, incr);
11306 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011307 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011308 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011309 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011310 line_pos++;
11311 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011312 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011313 if (ch == '\n' || ch == '\r')
11314 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011316 }
11317 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011318 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011319
Antoine Pitroue71d5742011-10-04 15:55:09 +020011320 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011321 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11322 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323}
11324
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011325PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011326 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327\n\
11328Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011329such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011330arguments start and end are interpreted as in slice notation.\n\
11331\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011332Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011333
11334static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011335unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011337 /* initialize variables to prevent gcc warning */
11338 PyObject *substring = NULL;
11339 Py_ssize_t start = 0;
11340 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011341 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342
Jesus Ceaac451502011-04-20 17:09:23 +020011343 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11344 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346
Christian Heimesd47802e2013-06-29 21:33:36 +020011347 if (PyUnicode_READY(self) == -1) {
11348 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011349 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011350 }
11351 if (PyUnicode_READY(substring) == -1) {
11352 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011354 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011355
Victor Stinner7931d9a2011-11-04 00:22:48 +010011356 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357
11358 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 if (result == -2)
11361 return NULL;
11362
Christian Heimes217cfd12007-12-02 14:31:20 +000011363 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364}
11365
11366static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011367unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011369 void *data;
11370 enum PyUnicode_Kind kind;
11371 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011372
11373 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11374 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011375 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011376 }
11377 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11378 PyErr_SetString(PyExc_IndexError, "string index out of range");
11379 return NULL;
11380 }
11381 kind = PyUnicode_KIND(self);
11382 data = PyUnicode_DATA(self);
11383 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011384 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385}
11386
Guido van Rossumc2504932007-09-18 19:42:40 +000011387/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011388 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011389static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011390unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011391{
Guido van Rossumc2504932007-09-18 19:42:40 +000011392 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011393 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011394
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011395#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011396 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011397#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011398 if (_PyUnicode_HASH(self) != -1)
11399 return _PyUnicode_HASH(self);
11400 if (PyUnicode_READY(self) == -1)
11401 return -1;
11402 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011403 /*
11404 We make the hash of the empty string be 0, rather than using
11405 (prefix ^ suffix), since this slightly obfuscates the hash secret
11406 */
11407 if (len == 0) {
11408 _PyUnicode_HASH(self) = 0;
11409 return 0;
11410 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011411 x = _Py_HashBytes(PyUnicode_DATA(self),
11412 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011414 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415}
11416
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011417PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011418 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011420Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421
11422static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011423unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011425 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011426 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011427 PyObject *substring = NULL;
11428 Py_ssize_t start = 0;
11429 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430
Jesus Ceaac451502011-04-20 17:09:23 +020011431 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11432 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434
Christian Heimesd47a0452013-06-29 21:21:37 +020011435 if (PyUnicode_READY(self) == -1) {
11436 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011437 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011438 }
11439 if (PyUnicode_READY(substring) == -1) {
11440 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011441 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011442 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011443
Victor Stinner7931d9a2011-11-04 00:22:48 +010011444 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445
11446 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011448 if (result == -2)
11449 return NULL;
11450
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451 if (result < 0) {
11452 PyErr_SetString(PyExc_ValueError, "substring not found");
11453 return NULL;
11454 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011455
Christian Heimes217cfd12007-12-02 14:31:20 +000011456 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457}
11458
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011459PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011460 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011462Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011463at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464
11465static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011466unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011468 Py_ssize_t i, length;
11469 int kind;
11470 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471 int cased;
11472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011473 if (PyUnicode_READY(self) == -1)
11474 return NULL;
11475 length = PyUnicode_GET_LENGTH(self);
11476 kind = PyUnicode_KIND(self);
11477 data = PyUnicode_DATA(self);
11478
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011480 if (length == 1)
11481 return PyBool_FromLong(
11482 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011484 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011485 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011486 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011487
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 for (i = 0; i < length; i++) {
11490 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011491
Benjamin Peterson29060642009-01-31 22:14:21 +000011492 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11493 return PyBool_FromLong(0);
11494 else if (!cased && Py_UNICODE_ISLOWER(ch))
11495 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011497 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498}
11499
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011500PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011501 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011503Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011504at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505
11506static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011507unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011509 Py_ssize_t i, length;
11510 int kind;
11511 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512 int cased;
11513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011514 if (PyUnicode_READY(self) == -1)
11515 return NULL;
11516 length = PyUnicode_GET_LENGTH(self);
11517 kind = PyUnicode_KIND(self);
11518 data = PyUnicode_DATA(self);
11519
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521 if (length == 1)
11522 return PyBool_FromLong(
11523 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011525 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011526 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011527 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011528
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011530 for (i = 0; i < length; i++) {
11531 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011532
Benjamin Peterson29060642009-01-31 22:14:21 +000011533 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11534 return PyBool_FromLong(0);
11535 else if (!cased && Py_UNICODE_ISUPPER(ch))
11536 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011538 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539}
11540
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011541PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011542 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011544Return True if S is a titlecased string and there is at least one\n\
11545character in S, i.e. upper- and titlecase characters may only\n\
11546follow uncased characters and lowercase characters only cased ones.\n\
11547Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548
11549static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011550unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011552 Py_ssize_t i, length;
11553 int kind;
11554 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555 int cased, previous_is_cased;
11556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011557 if (PyUnicode_READY(self) == -1)
11558 return NULL;
11559 length = PyUnicode_GET_LENGTH(self);
11560 kind = PyUnicode_KIND(self);
11561 data = PyUnicode_DATA(self);
11562
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011564 if (length == 1) {
11565 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11566 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11567 (Py_UNICODE_ISUPPER(ch) != 0));
11568 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011570 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011571 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011572 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011573
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574 cased = 0;
11575 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011576 for (i = 0; i < length; i++) {
11577 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011578
Benjamin Peterson29060642009-01-31 22:14:21 +000011579 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11580 if (previous_is_cased)
11581 return PyBool_FromLong(0);
11582 previous_is_cased = 1;
11583 cased = 1;
11584 }
11585 else if (Py_UNICODE_ISLOWER(ch)) {
11586 if (!previous_is_cased)
11587 return PyBool_FromLong(0);
11588 previous_is_cased = 1;
11589 cased = 1;
11590 }
11591 else
11592 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011594 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595}
11596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011597PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011598 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011600Return True if all characters in S are whitespace\n\
11601and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602
11603static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011604unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011606 Py_ssize_t i, length;
11607 int kind;
11608 void *data;
11609
11610 if (PyUnicode_READY(self) == -1)
11611 return NULL;
11612 length = PyUnicode_GET_LENGTH(self);
11613 kind = PyUnicode_KIND(self);
11614 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011617 if (length == 1)
11618 return PyBool_FromLong(
11619 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011621 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011622 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011623 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 for (i = 0; i < length; i++) {
11626 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011627 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011628 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011630 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631}
11632
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011633PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011634 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011635\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011636Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011637and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011638
11639static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011640unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011641{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011642 Py_ssize_t i, length;
11643 int kind;
11644 void *data;
11645
11646 if (PyUnicode_READY(self) == -1)
11647 return NULL;
11648 length = PyUnicode_GET_LENGTH(self);
11649 kind = PyUnicode_KIND(self);
11650 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011651
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011652 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011653 if (length == 1)
11654 return PyBool_FromLong(
11655 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011656
11657 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011658 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011659 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 for (i = 0; i < length; i++) {
11662 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011663 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011664 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011665 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011666}
11667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011668PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011669 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011670\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011671Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011672and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011673
11674static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011675unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011676{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011677 int kind;
11678 void *data;
11679 Py_ssize_t len, i;
11680
11681 if (PyUnicode_READY(self) == -1)
11682 return NULL;
11683
11684 kind = PyUnicode_KIND(self);
11685 data = PyUnicode_DATA(self);
11686 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011687
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011688 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011689 if (len == 1) {
11690 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11691 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11692 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011693
11694 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011695 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011696 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011698 for (i = 0; i < len; i++) {
11699 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011700 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011701 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011702 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011703 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011704}
11705
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011706PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011707 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011709Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011710False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711
11712static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011713unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011715 Py_ssize_t i, length;
11716 int kind;
11717 void *data;
11718
11719 if (PyUnicode_READY(self) == -1)
11720 return NULL;
11721 length = PyUnicode_GET_LENGTH(self);
11722 kind = PyUnicode_KIND(self);
11723 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011726 if (length == 1)
11727 return PyBool_FromLong(
11728 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011730 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011731 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011732 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 for (i = 0; i < length; i++) {
11735 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011736 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011738 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739}
11740
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011741PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011742 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011744Return True if all characters in S are digits\n\
11745and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746
11747static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011748unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011750 Py_ssize_t i, length;
11751 int kind;
11752 void *data;
11753
11754 if (PyUnicode_READY(self) == -1)
11755 return NULL;
11756 length = PyUnicode_GET_LENGTH(self);
11757 kind = PyUnicode_KIND(self);
11758 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011761 if (length == 1) {
11762 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11763 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11764 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011766 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011767 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011768 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770 for (i = 0; i < length; i++) {
11771 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011772 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011774 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775}
11776
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011777PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011778 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011780Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011781False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782
11783static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011784unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011786 Py_ssize_t i, length;
11787 int kind;
11788 void *data;
11789
11790 if (PyUnicode_READY(self) == -1)
11791 return NULL;
11792 length = PyUnicode_GET_LENGTH(self);
11793 kind = PyUnicode_KIND(self);
11794 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011797 if (length == 1)
11798 return PyBool_FromLong(
11799 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011801 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011803 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011805 for (i = 0; i < length; i++) {
11806 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011807 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011809 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011810}
11811
Martin v. Löwis47383402007-08-15 07:32:56 +000011812int
11813PyUnicode_IsIdentifier(PyObject *self)
11814{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 int kind;
11816 void *data;
11817 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011818 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011820 if (PyUnicode_READY(self) == -1) {
11821 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011822 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823 }
11824
11825 /* Special case for empty strings */
11826 if (PyUnicode_GET_LENGTH(self) == 0)
11827 return 0;
11828 kind = PyUnicode_KIND(self);
11829 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011830
11831 /* PEP 3131 says that the first character must be in
11832 XID_Start and subsequent characters in XID_Continue,
11833 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011834 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011835 letters, digits, underscore). However, given the current
11836 definition of XID_Start and XID_Continue, it is sufficient
11837 to check just for these, except that _ must be allowed
11838 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011839 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011840 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011841 return 0;
11842
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011843 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011844 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011845 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011846 return 1;
11847}
11848
11849PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011850 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011851\n\
11852Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011853to the language definition.\n\
11854\n\
11855Use keyword.iskeyword() to test for reserved identifiers\n\
11856such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011857
11858static PyObject*
11859unicode_isidentifier(PyObject *self)
11860{
11861 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11862}
11863
Georg Brandl559e5d72008-06-11 18:37:52 +000011864PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011865 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011866\n\
11867Return True if all characters in S are considered\n\
11868printable in repr() or S is empty, False otherwise.");
11869
11870static PyObject*
11871unicode_isprintable(PyObject *self)
11872{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011873 Py_ssize_t i, length;
11874 int kind;
11875 void *data;
11876
11877 if (PyUnicode_READY(self) == -1)
11878 return NULL;
11879 length = PyUnicode_GET_LENGTH(self);
11880 kind = PyUnicode_KIND(self);
11881 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011882
11883 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011884 if (length == 1)
11885 return PyBool_FromLong(
11886 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011887
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011888 for (i = 0; i < length; i++) {
11889 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011890 Py_RETURN_FALSE;
11891 }
11892 }
11893 Py_RETURN_TRUE;
11894}
11895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011896PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011897 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898\n\
11899Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011900iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901
11902static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011903unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011905 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906}
11907
Martin v. Löwis18e16552006-02-15 17:27:45 +000011908static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011909unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 if (PyUnicode_READY(self) == -1)
11912 return -1;
11913 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914}
11915
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011916PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011917 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011919Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011920done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921
11922static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011923unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011925 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 Py_UCS4 fillchar = ' ';
11927
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011928 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929 return NULL;
11930
Benjamin Petersonbac79492012-01-14 13:34:47 -050011931 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011932 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933
Victor Stinnerc4b49542011-12-11 22:44:26 +010011934 if (PyUnicode_GET_LENGTH(self) >= width)
11935 return unicode_result_unchanged(self);
11936
11937 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938}
11939
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011940PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011941 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011943Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011944
11945static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011946unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011948 if (PyUnicode_READY(self) == -1)
11949 return NULL;
11950 if (PyUnicode_IS_ASCII(self))
11951 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011952 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011953}
11954
/* Strip modes: which end(s) of the string get stripped */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its leading
   three-character "|O:" prefix skipped */
#define STRIPNAME(i) (stripformat[i]+3)
11963
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011964/* externally visible for str.strip(unicode) */
11965PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011966_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011967{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 void *data;
11969 int kind;
11970 Py_ssize_t i, j, len;
11971 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011972 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11975 return NULL;
11976
11977 kind = PyUnicode_KIND(self);
11978 data = PyUnicode_DATA(self);
11979 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011980 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11982 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011983 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011984
Benjamin Peterson14339b62009-01-31 16:36:08 +000011985 i = 0;
11986 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011987 while (i < len) {
11988 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11989 if (!BLOOM(sepmask, ch))
11990 break;
11991 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11992 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011993 i++;
11994 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011995 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011996
Benjamin Peterson14339b62009-01-31 16:36:08 +000011997 j = len;
11998 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011999 j--;
12000 while (j >= i) {
12001 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12002 if (!BLOOM(sepmask, ch))
12003 break;
12004 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12005 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012006 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012007 }
12008
Benjamin Peterson29060642009-01-31 22:14:21 +000012009 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012010 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012011
Victor Stinner7931d9a2011-11-04 00:22:48 +010012012 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013}
12014
12015PyObject*
12016PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12017{
12018 unsigned char *data;
12019 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012020 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021
Victor Stinnerde636f32011-10-01 03:55:54 +020012022 if (PyUnicode_READY(self) == -1)
12023 return NULL;
12024
Victor Stinner684d5fd2012-05-03 02:32:34 +020012025 length = PyUnicode_GET_LENGTH(self);
12026 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012027
Victor Stinner684d5fd2012-05-03 02:32:34 +020012028 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012029 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030
Victor Stinnerde636f32011-10-01 03:55:54 +020012031 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012032 PyErr_SetString(PyExc_IndexError, "string index out of range");
12033 return NULL;
12034 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012035 if (start >= length || end < start)
12036 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012037
Victor Stinner684d5fd2012-05-03 02:32:34 +020012038 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012039 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012040 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012041 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012042 }
12043 else {
12044 kind = PyUnicode_KIND(self);
12045 data = PyUnicode_1BYTE_DATA(self);
12046 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012047 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012048 length);
12049 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012050}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051
12052static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012053do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012055 Py_ssize_t len, i, j;
12056
12057 if (PyUnicode_READY(self) == -1)
12058 return NULL;
12059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012061
Victor Stinnercc7af722013-04-09 22:39:24 +020012062 if (PyUnicode_IS_ASCII(self)) {
12063 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12064
12065 i = 0;
12066 if (striptype != RIGHTSTRIP) {
12067 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012068 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012069 if (!_Py_ascii_whitespace[ch])
12070 break;
12071 i++;
12072 }
12073 }
12074
12075 j = len;
12076 if (striptype != LEFTSTRIP) {
12077 j--;
12078 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012079 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012080 if (!_Py_ascii_whitespace[ch])
12081 break;
12082 j--;
12083 }
12084 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012085 }
12086 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012087 else {
12088 int kind = PyUnicode_KIND(self);
12089 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012090
Victor Stinnercc7af722013-04-09 22:39:24 +020012091 i = 0;
12092 if (striptype != RIGHTSTRIP) {
12093 while (i < len) {
12094 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12095 if (!Py_UNICODE_ISSPACE(ch))
12096 break;
12097 i++;
12098 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012099 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012100
12101 j = len;
12102 if (striptype != LEFTSTRIP) {
12103 j--;
12104 while (j >= i) {
12105 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12106 if (!Py_UNICODE_ISSPACE(ch))
12107 break;
12108 j--;
12109 }
12110 j++;
12111 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012112 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012113
Victor Stinner7931d9a2011-11-04 00:22:48 +010012114 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115}
12116
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012117
12118static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012119do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012120{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012121 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012122
Serhiy Storchakac6792272013-10-19 21:03:34 +030012123 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012124 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012125
Benjamin Peterson14339b62009-01-31 16:36:08 +000012126 if (sep != NULL && sep != Py_None) {
12127 if (PyUnicode_Check(sep))
12128 return _PyUnicode_XStrip(self, striptype, sep);
12129 else {
12130 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012131 "%s arg must be None or str",
12132 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012133 return NULL;
12134 }
12135 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012136
Benjamin Peterson14339b62009-01-31 16:36:08 +000012137 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012138}
12139
12140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012141PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012142 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012143\n\
12144Return a copy of the string S with leading and trailing\n\
12145whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012146If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012147
12148static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012149unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012150{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012151 if (PyTuple_GET_SIZE(args) == 0)
12152 return do_strip(self, BOTHSTRIP); /* Common case */
12153 else
12154 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012155}
12156
12157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012158PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012159 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012160\n\
12161Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012162If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012163
12164static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012165unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012166{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012167 if (PyTuple_GET_SIZE(args) == 0)
12168 return do_strip(self, LEFTSTRIP); /* Common case */
12169 else
12170 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012171}
12172
12173
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012174PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012175 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012176\n\
12177Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012178If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012179
12180static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012181unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012182{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012183 if (PyTuple_GET_SIZE(args) == 0)
12184 return do_strip(self, RIGHTSTRIP); /* Common case */
12185 else
12186 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012187}
12188
12189
Guido van Rossumd57fd912000-03-10 22:53:23 +000012190static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012191unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012192{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012193 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195
Serhiy Storchaka05997252013-01-26 12:14:02 +020012196 if (len < 1)
12197 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198
Victor Stinnerc4b49542011-12-11 22:44:26 +010012199 /* no repeat, return original string */
12200 if (len == 1)
12201 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012202
Benjamin Petersonbac79492012-01-14 13:34:47 -050012203 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012204 return NULL;
12205
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012206 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012207 PyErr_SetString(PyExc_OverflowError,
12208 "repeated string is too long");
12209 return NULL;
12210 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012211 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012212
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012213 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012214 if (!u)
12215 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012216 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218 if (PyUnicode_GET_LENGTH(str) == 1) {
12219 const int kind = PyUnicode_KIND(str);
12220 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012221 if (kind == PyUnicode_1BYTE_KIND) {
12222 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012223 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012224 }
12225 else if (kind == PyUnicode_2BYTE_KIND) {
12226 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012227 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012228 ucs2[n] = fill_char;
12229 } else {
12230 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12231 assert(kind == PyUnicode_4BYTE_KIND);
12232 for (n = 0; n < len; ++n)
12233 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012235 }
12236 else {
12237 /* number of characters copied this far */
12238 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012239 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012240 char *to = (char *) PyUnicode_DATA(u);
12241 Py_MEMCPY(to, PyUnicode_DATA(str),
12242 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012243 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012244 n = (done <= nchars-done) ? done : nchars-done;
12245 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012246 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012247 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248 }
12249
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012250 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012251 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252}
12253
Alexander Belopolsky40018472011-02-26 01:02:56 +000012254PyObject *
12255PyUnicode_Replace(PyObject *obj,
12256 PyObject *subobj,
12257 PyObject *replobj,
12258 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259{
12260 PyObject *self;
12261 PyObject *str1;
12262 PyObject *str2;
12263 PyObject *result;
12264
12265 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012266 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012267 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012269 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012270 Py_DECREF(self);
12271 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272 }
12273 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012274 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012275 Py_DECREF(self);
12276 Py_DECREF(str1);
12277 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012279 if (PyUnicode_READY(self) == -1 ||
12280 PyUnicode_READY(str1) == -1 ||
12281 PyUnicode_READY(str2) == -1)
12282 result = NULL;
12283 else
12284 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285 Py_DECREF(self);
12286 Py_DECREF(str1);
12287 Py_DECREF(str2);
12288 return result;
12289}
12290
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012291PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012292 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012293\n\
12294Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012295old replaced by new. If the optional argument count is\n\
12296given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297
12298static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012299unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012300{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012301 PyObject *str1;
12302 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012303 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012304 PyObject *result;
12305
Martin v. Löwis18e16552006-02-15 17:27:45 +000012306 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012308 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012309 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012311 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312 return NULL;
12313 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012314 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012315 Py_DECREF(str1);
12316 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012317 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012318 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12319 result = NULL;
12320 else
12321 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012322
12323 Py_DECREF(str1);
12324 Py_DECREF(str2);
12325 return result;
12326}
12327
Alexander Belopolsky40018472011-02-26 01:02:56 +000012328static PyObject *
12329unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012330{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012331 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332 Py_ssize_t isize;
12333 Py_ssize_t osize, squote, dquote, i, o;
12334 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012335 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012336 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012338 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012339 return NULL;
12340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012341 isize = PyUnicode_GET_LENGTH(unicode);
12342 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344 /* Compute length of output, quote characters, and
12345 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012346 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012347 max = 127;
12348 squote = dquote = 0;
12349 ikind = PyUnicode_KIND(unicode);
12350 for (i = 0; i < isize; i++) {
12351 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012352 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012353 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012354 case '\'': squote++; break;
12355 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012356 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012357 incr = 2;
12358 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359 default:
12360 /* Fast-path ASCII */
12361 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012362 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012364 ;
12365 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012368 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012370 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012372 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012373 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012374 if (osize > PY_SSIZE_T_MAX - incr) {
12375 PyErr_SetString(PyExc_OverflowError,
12376 "string is too long to generate repr");
12377 return NULL;
12378 }
12379 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012380 }
12381
12382 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012383 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012384 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012385 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012386 if (dquote)
12387 /* Both squote and dquote present. Use squote,
12388 and escape them */
12389 osize += squote;
12390 else
12391 quote = '"';
12392 }
Victor Stinner55c08782013-04-14 18:45:39 +020012393 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012394
12395 repr = PyUnicode_New(osize, max);
12396 if (repr == NULL)
12397 return NULL;
12398 okind = PyUnicode_KIND(repr);
12399 odata = PyUnicode_DATA(repr);
12400
12401 PyUnicode_WRITE(okind, odata, 0, quote);
12402 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012403 if (unchanged) {
12404 _PyUnicode_FastCopyCharacters(repr, 1,
12405 unicode, 0,
12406 isize);
12407 }
12408 else {
12409 for (i = 0, o = 1; i < isize; i++) {
12410 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411
Victor Stinner55c08782013-04-14 18:45:39 +020012412 /* Escape quotes and backslashes */
12413 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012414 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012415 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012416 continue;
12417 }
12418
12419 /* Map special whitespace to '\t', \n', '\r' */
12420 if (ch == '\t') {
12421 PyUnicode_WRITE(okind, odata, o++, '\\');
12422 PyUnicode_WRITE(okind, odata, o++, 't');
12423 }
12424 else if (ch == '\n') {
12425 PyUnicode_WRITE(okind, odata, o++, '\\');
12426 PyUnicode_WRITE(okind, odata, o++, 'n');
12427 }
12428 else if (ch == '\r') {
12429 PyUnicode_WRITE(okind, odata, o++, '\\');
12430 PyUnicode_WRITE(okind, odata, o++, 'r');
12431 }
12432
12433 /* Map non-printable US ASCII to '\xhh' */
12434 else if (ch < ' ' || ch == 0x7F) {
12435 PyUnicode_WRITE(okind, odata, o++, '\\');
12436 PyUnicode_WRITE(okind, odata, o++, 'x');
12437 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12438 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12439 }
12440
12441 /* Copy ASCII characters as-is */
12442 else if (ch < 0x7F) {
12443 PyUnicode_WRITE(okind, odata, o++, ch);
12444 }
12445
12446 /* Non-ASCII characters */
12447 else {
12448 /* Map Unicode whitespace and control characters
12449 (categories Z* and C* except ASCII space)
12450 */
12451 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12452 PyUnicode_WRITE(okind, odata, o++, '\\');
12453 /* Map 8-bit characters to '\xhh' */
12454 if (ch <= 0xff) {
12455 PyUnicode_WRITE(okind, odata, o++, 'x');
12456 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12457 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12458 }
12459 /* Map 16-bit characters to '\uxxxx' */
12460 else if (ch <= 0xffff) {
12461 PyUnicode_WRITE(okind, odata, o++, 'u');
12462 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12463 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12464 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12465 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12466 }
12467 /* Map 21-bit characters to '\U00xxxxxx' */
12468 else {
12469 PyUnicode_WRITE(okind, odata, o++, 'U');
12470 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12471 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12472 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12473 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12474 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12475 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12476 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12477 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12478 }
12479 }
12480 /* Copy characters as-is */
12481 else {
12482 PyUnicode_WRITE(okind, odata, o++, ch);
12483 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012484 }
12485 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012486 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012487 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012488 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012489 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490}
12491
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012492PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012493 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012494\n\
12495Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012496such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012497arguments start and end are interpreted as in slice notation.\n\
12498\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012499Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012500
12501static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012502unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012504 /* initialize variables to prevent gcc warning */
12505 PyObject *substring = NULL;
12506 Py_ssize_t start = 0;
12507 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012508 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509
Jesus Ceaac451502011-04-20 17:09:23 +020012510 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12511 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012512 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513
Christian Heimesea71a522013-06-29 21:17:34 +020012514 if (PyUnicode_READY(self) == -1) {
12515 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012516 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012517 }
12518 if (PyUnicode_READY(substring) == -1) {
12519 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012520 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012521 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012522
Victor Stinner7931d9a2011-11-04 00:22:48 +010012523 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524
12525 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012527 if (result == -2)
12528 return NULL;
12529
Christian Heimes217cfd12007-12-02 14:31:20 +000012530 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531}
12532
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012533PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012534 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012536Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537
12538static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012539unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012540{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012541 /* initialize variables to prevent gcc warning */
12542 PyObject *substring = NULL;
12543 Py_ssize_t start = 0;
12544 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012545 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546
Jesus Ceaac451502011-04-20 17:09:23 +020012547 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12548 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012549 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550
Christian Heimesea71a522013-06-29 21:17:34 +020012551 if (PyUnicode_READY(self) == -1) {
12552 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012553 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012554 }
12555 if (PyUnicode_READY(substring) == -1) {
12556 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012558 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012559
Victor Stinner7931d9a2011-11-04 00:22:48 +010012560 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012561
12562 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012564 if (result == -2)
12565 return NULL;
12566
Guido van Rossumd57fd912000-03-10 22:53:23 +000012567 if (result < 0) {
12568 PyErr_SetString(PyExc_ValueError, "substring not found");
12569 return NULL;
12570 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012571
Christian Heimes217cfd12007-12-02 14:31:20 +000012572 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573}
12574
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012575PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012576 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012578Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012579done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012580
12581static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012582unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012584 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012585 Py_UCS4 fillchar = ' ';
12586
Victor Stinnere9a29352011-10-01 02:14:59 +020012587 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012588 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012589
Benjamin Petersonbac79492012-01-14 13:34:47 -050012590 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012591 return NULL;
12592
Victor Stinnerc4b49542011-12-11 22:44:26 +010012593 if (PyUnicode_GET_LENGTH(self) >= width)
12594 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012595
Victor Stinnerc4b49542011-12-11 22:44:26 +010012596 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012597}
12598
Alexander Belopolsky40018472011-02-26 01:02:56 +000012599PyObject *
12600PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012601{
12602 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012603
Guido van Rossumd57fd912000-03-10 22:53:23 +000012604 s = PyUnicode_FromObject(s);
12605 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012606 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012607 if (sep != NULL) {
12608 sep = PyUnicode_FromObject(sep);
12609 if (sep == NULL) {
12610 Py_DECREF(s);
12611 return NULL;
12612 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613 }
12614
Victor Stinner9310abb2011-10-05 00:59:23 +020012615 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012616
12617 Py_DECREF(s);
12618 Py_XDECREF(sep);
12619 return result;
12620}
12621
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012622PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012623 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012624\n\
12625Return a list of the words in S, using sep as the\n\
12626delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012627splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012628whitespace string is a separator and empty strings are\n\
12629removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012630
12631static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012632unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012634 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012636 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012638 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12639 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012640 return NULL;
12641
12642 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012643 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012644 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012645 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012646 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012647 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012648}
12649
Thomas Wouters477c8d52006-05-27 19:21:47 +000012650PyObject *
12651PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12652{
12653 PyObject* str_obj;
12654 PyObject* sep_obj;
12655 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012656 int kind1, kind2;
12657 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012658 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012659
12660 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012661 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012662 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012663 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012664 if (!sep_obj) {
12665 Py_DECREF(str_obj);
12666 return NULL;
12667 }
12668 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12669 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012670 Py_DECREF(str_obj);
12671 return NULL;
12672 }
12673
Victor Stinner14f8f022011-10-05 20:58:25 +020012674 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012675 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 len1 = PyUnicode_GET_LENGTH(str_obj);
12677 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012678 if (kind1 < kind2 || len1 < len2) {
12679 _Py_INCREF_UNICODE_EMPTY();
12680 if (!unicode_empty)
12681 out = NULL;
12682 else {
12683 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12684 Py_DECREF(unicode_empty);
12685 }
12686 Py_DECREF(sep_obj);
12687 Py_DECREF(str_obj);
12688 return out;
12689 }
12690 buf1 = PyUnicode_DATA(str_obj);
12691 buf2 = PyUnicode_DATA(sep_obj);
12692 if (kind2 != kind1) {
12693 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12694 if (!buf2)
12695 goto onError;
12696 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012697
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012698 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012700 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12701 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12702 else
12703 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704 break;
12705 case PyUnicode_2BYTE_KIND:
12706 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12707 break;
12708 case PyUnicode_4BYTE_KIND:
12709 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12710 break;
12711 default:
12712 assert(0);
12713 out = 0;
12714 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012715
12716 Py_DECREF(sep_obj);
12717 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012718 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012719 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012720
12721 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012722 onError:
12723 Py_DECREF(sep_obj);
12724 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012725 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012726 PyMem_Free(buf2);
12727 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012728}
12729
12730
12731PyObject *
12732PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12733{
12734 PyObject* str_obj;
12735 PyObject* sep_obj;
12736 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012737 int kind1, kind2;
12738 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012739 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012740
12741 str_obj = PyUnicode_FromObject(str_in);
12742 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012743 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012744 sep_obj = PyUnicode_FromObject(sep_in);
12745 if (!sep_obj) {
12746 Py_DECREF(str_obj);
12747 return NULL;
12748 }
12749
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012750 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012752 len1 = PyUnicode_GET_LENGTH(str_obj);
12753 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012754 if (kind1 < kind2 || len1 < len2) {
12755 _Py_INCREF_UNICODE_EMPTY();
12756 if (!unicode_empty)
12757 out = NULL;
12758 else {
12759 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12760 Py_DECREF(unicode_empty);
12761 }
12762 Py_DECREF(sep_obj);
12763 Py_DECREF(str_obj);
12764 return out;
12765 }
12766 buf1 = PyUnicode_DATA(str_obj);
12767 buf2 = PyUnicode_DATA(sep_obj);
12768 if (kind2 != kind1) {
12769 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12770 if (!buf2)
12771 goto onError;
12772 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012773
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012774 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012775 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012776 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12777 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12778 else
12779 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012780 break;
12781 case PyUnicode_2BYTE_KIND:
12782 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12783 break;
12784 case PyUnicode_4BYTE_KIND:
12785 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12786 break;
12787 default:
12788 assert(0);
12789 out = 0;
12790 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012791
12792 Py_DECREF(sep_obj);
12793 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012794 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012795 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012796
12797 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012798 onError:
12799 Py_DECREF(sep_obj);
12800 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012801 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012802 PyMem_Free(buf2);
12803 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012804}
12805
12806PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012807 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012808\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012809Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012810the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012811found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012812
12813static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012814unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012815{
Victor Stinner9310abb2011-10-05 00:59:23 +020012816 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012817}
12818
12819PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012820 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012821\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012822Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012823the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012824separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012825
12826static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012827unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012828{
Victor Stinner9310abb2011-10-05 00:59:23 +020012829 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012830}
12831
Alexander Belopolsky40018472011-02-26 01:02:56 +000012832PyObject *
12833PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012834{
12835 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012836
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012837 s = PyUnicode_FromObject(s);
12838 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012839 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012840 if (sep != NULL) {
12841 sep = PyUnicode_FromObject(sep);
12842 if (sep == NULL) {
12843 Py_DECREF(s);
12844 return NULL;
12845 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012846 }
12847
Victor Stinner9310abb2011-10-05 00:59:23 +020012848 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012849
12850 Py_DECREF(s);
12851 Py_XDECREF(sep);
12852 return result;
12853}
12854
12855PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012856 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012857\n\
12858Return a list of the words in S, using sep as the\n\
12859delimiter string, starting at the end of the string and\n\
12860working to the front. If maxsplit is given, at most maxsplit\n\
12861splits are done. If sep is not specified, any whitespace string\n\
12862is a separator.");
12863
12864static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012865unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012866{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012867 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012868 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012869 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012870
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012871 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12872 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012873 return NULL;
12874
12875 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012876 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012877 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012878 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012879 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012880 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012881}
12882
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012883PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012884 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012885\n\
12886Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012887Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012888is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889
12890static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012891unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012892{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012893 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012894 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012895
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012896 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12897 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012898 return NULL;
12899
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012900 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012901}
12902
12903static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012904PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012905{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012906 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012907}
12908
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012909PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012910 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012911\n\
12912Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012913and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012914
12915static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012916unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012917{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012918 if (PyUnicode_READY(self) == -1)
12919 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012920 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012921}
12922
Larry Hastings61272b72014-01-07 12:41:53 -080012923/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012924
Larry Hastings31826802013-10-19 00:09:25 -070012925@staticmethod
12926str.maketrans as unicode_maketrans
12927
12928 x: object
12929
12930 y: unicode=NULL
12931
12932 z: unicode=NULL
12933
12934 /
12935
12936Return a translation table usable for str.translate().
12937
12938If there is only one argument, it must be a dictionary mapping Unicode
12939ordinals (integers) or characters to Unicode ordinals, strings or None.
12940Character keys will be then converted to ordinals.
12941If there are two arguments, they must be strings of equal length, and
12942in the resulting dictionary, each character in x will be mapped to the
12943character at the same position in y. If there is a third argument, it
12944must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012945[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012946
Larry Hastings31826802013-10-19 00:09:25 -070012947static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012948unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030012949/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012950{
Georg Brandlceee0772007-11-27 23:48:05 +000012951 PyObject *new = NULL, *key, *value;
12952 Py_ssize_t i = 0;
12953 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012954
Georg Brandlceee0772007-11-27 23:48:05 +000012955 new = PyDict_New();
12956 if (!new)
12957 return NULL;
12958 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012959 int x_kind, y_kind, z_kind;
12960 void *x_data, *y_data, *z_data;
12961
Georg Brandlceee0772007-11-27 23:48:05 +000012962 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012963 if (!PyUnicode_Check(x)) {
12964 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12965 "be a string if there is a second argument");
12966 goto err;
12967 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012968 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012969 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12970 "arguments must have equal length");
12971 goto err;
12972 }
12973 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012974 x_kind = PyUnicode_KIND(x);
12975 y_kind = PyUnicode_KIND(y);
12976 x_data = PyUnicode_DATA(x);
12977 y_data = PyUnicode_DATA(y);
12978 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12979 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012980 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012981 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012982 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012983 if (!value) {
12984 Py_DECREF(key);
12985 goto err;
12986 }
Georg Brandlceee0772007-11-27 23:48:05 +000012987 res = PyDict_SetItem(new, key, value);
12988 Py_DECREF(key);
12989 Py_DECREF(value);
12990 if (res < 0)
12991 goto err;
12992 }
12993 /* create entries for deleting chars in z */
12994 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012995 z_kind = PyUnicode_KIND(z);
12996 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012997 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012999 if (!key)
13000 goto err;
13001 res = PyDict_SetItem(new, key, Py_None);
13002 Py_DECREF(key);
13003 if (res < 0)
13004 goto err;
13005 }
13006 }
13007 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013008 int kind;
13009 void *data;
13010
Georg Brandlceee0772007-11-27 23:48:05 +000013011 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013012 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013013 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13014 "to maketrans it must be a dict");
13015 goto err;
13016 }
13017 /* copy entries into the new dict, converting string keys to int keys */
13018 while (PyDict_Next(x, &i, &key, &value)) {
13019 if (PyUnicode_Check(key)) {
13020 /* convert string keys to integer keys */
13021 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013022 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013023 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13024 "table must be of length 1");
13025 goto err;
13026 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013027 kind = PyUnicode_KIND(key);
13028 data = PyUnicode_DATA(key);
13029 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013030 if (!newkey)
13031 goto err;
13032 res = PyDict_SetItem(new, newkey, value);
13033 Py_DECREF(newkey);
13034 if (res < 0)
13035 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013036 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013037 /* just keep integer keys */
13038 if (PyDict_SetItem(new, key, value) < 0)
13039 goto err;
13040 } else {
13041 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13042 "be strings or integers");
13043 goto err;
13044 }
13045 }
13046 }
13047 return new;
13048 err:
13049 Py_DECREF(new);
13050 return NULL;
13051}
13052
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013053PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013054 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013055\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013056Return a copy of the string S in which each character has been mapped\n\
13057through the given translation table. The table must implement\n\
13058lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13059mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13060this operation raises LookupError, the character is left untouched.\n\
13061Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013062
13063static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013064unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013065{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013066 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013067}
13068
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013069PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013070 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013072Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013073
13074static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013075unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013076{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013077 if (PyUnicode_READY(self) == -1)
13078 return NULL;
13079 if (PyUnicode_IS_ASCII(self))
13080 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013081 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082}
13083
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013084PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013085 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013087Pad a numeric string S with zeros on the left, to fill a field\n\
13088of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013089
13090static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013091unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013093 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013094 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013095 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013096 int kind;
13097 void *data;
13098 Py_UCS4 chr;
13099
Martin v. Löwis18e16552006-02-15 17:27:45 +000013100 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013101 return NULL;
13102
Benjamin Petersonbac79492012-01-14 13:34:47 -050013103 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013104 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105
Victor Stinnerc4b49542011-12-11 22:44:26 +010013106 if (PyUnicode_GET_LENGTH(self) >= width)
13107 return unicode_result_unchanged(self);
13108
13109 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110
13111 u = pad(self, fill, 0, '0');
13112
Walter Dörwald068325e2002-04-15 13:36:47 +000013113 if (u == NULL)
13114 return NULL;
13115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013116 kind = PyUnicode_KIND(u);
13117 data = PyUnicode_DATA(u);
13118 chr = PyUnicode_READ(kind, data, fill);
13119
13120 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013121 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013122 PyUnicode_WRITE(kind, data, 0, chr);
13123 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124 }
13125
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013126 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013127 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129
#if 0
/* Compiled out by the surrounding #if 0.  Returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13137
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013138PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013139 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013140\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013141Return True if S starts with the specified prefix, False otherwise.\n\
13142With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013143With optional end, stop comparing S at that position.\n\
13144prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013145
13146static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013147unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013148 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013149{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013150 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013151 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013152 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013153 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013154 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013155
Jesus Ceaac451502011-04-20 17:09:23 +020013156 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013157 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013158 if (PyTuple_Check(subobj)) {
13159 Py_ssize_t i;
13160 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013161 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013162 if (substring == NULL)
13163 return NULL;
13164 result = tailmatch(self, substring, start, end, -1);
13165 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013166 if (result == -1)
13167 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013168 if (result) {
13169 Py_RETURN_TRUE;
13170 }
13171 }
13172 /* nothing matched */
13173 Py_RETURN_FALSE;
13174 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013175 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013176 if (substring == NULL) {
13177 if (PyErr_ExceptionMatches(PyExc_TypeError))
13178 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13179 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013180 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013181 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013182 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013183 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013184 if (result == -1)
13185 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013186 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013187}
13188
13189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013190PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013191 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013192\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013193Return True if S ends with the specified suffix, False otherwise.\n\
13194With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013195With optional end, stop comparing S at that position.\n\
13196suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013197
13198static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013199unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013200 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013201{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013202 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013203 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013204 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013205 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013206 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013207
Jesus Ceaac451502011-04-20 17:09:23 +020013208 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013209 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013210 if (PyTuple_Check(subobj)) {
13211 Py_ssize_t i;
13212 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013213 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013214 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013215 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013216 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013217 result = tailmatch(self, substring, start, end, +1);
13218 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013219 if (result == -1)
13220 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013221 if (result) {
13222 Py_RETURN_TRUE;
13223 }
13224 }
13225 Py_RETURN_FALSE;
13226 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013227 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013228 if (substring == NULL) {
13229 if (PyErr_ExceptionMatches(PyExc_TypeError))
13230 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13231 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013232 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013233 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013234 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013235 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013236 if (result == -1)
13237 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013238 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013239}
13240
Victor Stinner202fdca2012-05-07 12:47:02 +020013241Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013242_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013243{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013244 if (!writer->readonly)
13245 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13246 else {
13247 /* Copy-on-write mode: set buffer size to 0 so
13248 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13249 * next write. */
13250 writer->size = 0;
13251 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013252 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13253 writer->data = PyUnicode_DATA(writer->buffer);
13254 writer->kind = PyUnicode_KIND(writer->buffer);
13255}
13256
Victor Stinnerd3f08822012-05-29 12:57:52 +020013257void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013258_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013259{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013260 memset(writer, 0, sizeof(*writer));
13261#ifdef Py_DEBUG
13262 writer->kind = 5; /* invalid kind */
13263#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013264 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013265}
13266
Victor Stinnerd3f08822012-05-29 12:57:52 +020013267int
13268_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13269 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013270{
Victor Stinner6989ba02013-11-18 21:08:39 +010013271#ifdef MS_WINDOWS
13272 /* On Windows, overallocate by 50% is the best factor */
13273# define OVERALLOCATE_FACTOR 2
13274#else
13275 /* On Linux, overallocate by 25% is the best factor */
13276# define OVERALLOCATE_FACTOR 4
13277#endif
Victor Stinner202fdca2012-05-07 12:47:02 +020013278 Py_ssize_t newlen;
13279 PyObject *newbuffer;
13280
Victor Stinnerd3f08822012-05-29 12:57:52 +020013281 assert(length > 0);
13282
Victor Stinner202fdca2012-05-07 12:47:02 +020013283 if (length > PY_SSIZE_T_MAX - writer->pos) {
13284 PyErr_NoMemory();
13285 return -1;
13286 }
13287 newlen = writer->pos + length;
13288
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013289 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013290
Victor Stinnerd3f08822012-05-29 12:57:52 +020013291 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013292 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013293 if (writer->overallocate
13294 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13295 /* overallocate to limit the number of realloc() */
13296 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013297 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013298 if (newlen < writer->min_length)
13299 newlen = writer->min_length;
13300
Victor Stinnerd3f08822012-05-29 12:57:52 +020013301 writer->buffer = PyUnicode_New(newlen, maxchar);
13302 if (writer->buffer == NULL)
13303 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013304 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013305 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013306 if (writer->overallocate
13307 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13308 /* overallocate to limit the number of realloc() */
13309 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013310 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013311 if (newlen < writer->min_length)
13312 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013313
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013314 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013315 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013316 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013317 newbuffer = PyUnicode_New(newlen, maxchar);
13318 if (newbuffer == NULL)
13319 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013320 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13321 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013322 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013323 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013324 }
13325 else {
13326 newbuffer = resize_compact(writer->buffer, newlen);
13327 if (newbuffer == NULL)
13328 return -1;
13329 }
13330 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013331 }
13332 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013333 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013334 newbuffer = PyUnicode_New(writer->size, maxchar);
13335 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013336 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013337 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13338 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013339 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013340 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013341 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013342 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013343
13344#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013345}
13346
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013347Py_LOCAL_INLINE(int)
13348_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013349{
13350 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13351 return -1;
13352 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13353 writer->pos++;
13354 return 0;
13355}
13356
13357int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013358_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13359{
13360 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13361}
13362
13363int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013364_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13365{
13366 Py_UCS4 maxchar;
13367 Py_ssize_t len;
13368
13369 if (PyUnicode_READY(str) == -1)
13370 return -1;
13371 len = PyUnicode_GET_LENGTH(str);
13372 if (len == 0)
13373 return 0;
13374 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13375 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013376 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013377 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013378 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013379 Py_INCREF(str);
13380 writer->buffer = str;
13381 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013382 writer->pos += len;
13383 return 0;
13384 }
13385 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13386 return -1;
13387 }
13388 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13389 str, 0, len);
13390 writer->pos += len;
13391 return 0;
13392}
13393
Victor Stinnere215d962012-10-06 23:03:36 +020013394int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013395_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13396 Py_ssize_t start, Py_ssize_t end)
13397{
13398 Py_UCS4 maxchar;
13399 Py_ssize_t len;
13400
13401 if (PyUnicode_READY(str) == -1)
13402 return -1;
13403
13404 assert(0 <= start);
13405 assert(end <= PyUnicode_GET_LENGTH(str));
13406 assert(start <= end);
13407
13408 if (end == 0)
13409 return 0;
13410
13411 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13412 return _PyUnicodeWriter_WriteStr(writer, str);
13413
13414 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13415 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13416 else
13417 maxchar = writer->maxchar;
13418 len = end - start;
13419
13420 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13421 return -1;
13422
13423 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13424 str, start, len);
13425 writer->pos += len;
13426 return 0;
13427}
13428
13429int
Victor Stinner4a587072013-11-19 12:54:53 +010013430_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13431 const char *ascii, Py_ssize_t len)
13432{
13433 if (len == -1)
13434 len = strlen(ascii);
13435
13436 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13437
13438 if (writer->buffer == NULL && !writer->overallocate) {
13439 PyObject *str;
13440
13441 str = _PyUnicode_FromASCII(ascii, len);
13442 if (str == NULL)
13443 return -1;
13444
13445 writer->readonly = 1;
13446 writer->buffer = str;
13447 _PyUnicodeWriter_Update(writer);
13448 writer->pos += len;
13449 return 0;
13450 }
13451
13452 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13453 return -1;
13454
13455 switch (writer->kind)
13456 {
13457 case PyUnicode_1BYTE_KIND:
13458 {
13459 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13460 Py_UCS1 *data = writer->data;
13461
13462 Py_MEMCPY(data + writer->pos, str, len);
13463 break;
13464 }
13465 case PyUnicode_2BYTE_KIND:
13466 {
13467 _PyUnicode_CONVERT_BYTES(
13468 Py_UCS1, Py_UCS2,
13469 ascii, ascii + len,
13470 (Py_UCS2 *)writer->data + writer->pos);
13471 break;
13472 }
13473 case PyUnicode_4BYTE_KIND:
13474 {
13475 _PyUnicode_CONVERT_BYTES(
13476 Py_UCS1, Py_UCS4,
13477 ascii, ascii + len,
13478 (Py_UCS4 *)writer->data + writer->pos);
13479 break;
13480 }
13481 default:
13482 assert(0);
13483 }
13484
13485 writer->pos += len;
13486 return 0;
13487}
13488
13489int
13490_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13491 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013492{
13493 Py_UCS4 maxchar;
13494
13495 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13496 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13497 return -1;
13498 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13499 writer->pos += len;
13500 return 0;
13501}
13502
Victor Stinnerd3f08822012-05-29 12:57:52 +020013503PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013504_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013505{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013506 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013507 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013508 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013509 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013510 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013511 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013512 str = writer->buffer;
13513 writer->buffer = NULL;
13514 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13515 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013516 }
13517 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13518 PyObject *newbuffer;
13519 newbuffer = resize_compact(writer->buffer, writer->pos);
13520 if (newbuffer == NULL) {
Serhiy Storchakadfe98a12014-02-09 13:46:20 +020013521 Py_CLEAR(writer->buffer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013522 return NULL;
13523 }
13524 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013525 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013526 str = writer->buffer;
13527 writer->buffer = NULL;
13528 assert(_PyUnicode_CheckConsistency(str, 1));
13529 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013530}
13531
Victor Stinnerd3f08822012-05-29 12:57:52 +020013532void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013533_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013534{
13535 Py_CLEAR(writer->buffer);
13536}
13537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013538#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013539
13540PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013541 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013542\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013543Return a formatted version of S, using substitutions from args and kwargs.\n\
13544The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013545
Eric Smith27bbca62010-11-04 17:06:58 +000013546PyDoc_STRVAR(format_map__doc__,
13547 "S.format_map(mapping) -> str\n\
13548\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013549Return a formatted version of S, using substitutions from mapping.\n\
13550The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013551
Eric Smith4a7d76d2008-05-30 18:10:19 +000013552static PyObject *
13553unicode__format__(PyObject* self, PyObject* args)
13554{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013555 PyObject *format_spec;
13556 _PyUnicodeWriter writer;
13557 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013558
13559 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13560 return NULL;
13561
Victor Stinnerd3f08822012-05-29 12:57:52 +020013562 if (PyUnicode_READY(self) == -1)
13563 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013564 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013565 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13566 self, format_spec, 0,
13567 PyUnicode_GET_LENGTH(format_spec));
13568 if (ret == -1) {
13569 _PyUnicodeWriter_Dealloc(&writer);
13570 return NULL;
13571 }
13572 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013573}
13574
Eric Smith8c663262007-08-25 02:26:07 +000013575PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013576 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013577\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013578Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013579
13580static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013581unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013582{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013583 Py_ssize_t size;
13584
13585 /* If it's a compact object, account for base structure +
13586 character data. */
13587 if (PyUnicode_IS_COMPACT_ASCII(v))
13588 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13589 else if (PyUnicode_IS_COMPACT(v))
13590 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013591 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013592 else {
13593 /* If it is a two-block object, account for base object, and
13594 for character block if present. */
13595 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013596 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013597 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013598 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013599 }
13600 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013601 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013602 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013603 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013604 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013605 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013606
13607 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013608}
13609
13610PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013611 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013612
13613static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013614unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013615{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013616 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013617 if (!copy)
13618 return NULL;
13619 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013620}
13621
Guido van Rossumd57fd912000-03-10 22:53:23 +000013622static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013623 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013624 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013625 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13626 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013627 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13628 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013629 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013630 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13631 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13632 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013633 {"expandtabs", (PyCFunction) unicode_expandtabs,
13634 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013635 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013636 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013637 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13638 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13639 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013640 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013641 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13642 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13643 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013644 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013645 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013646 {"splitlines", (PyCFunction) unicode_splitlines,
13647 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013648 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013649 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13650 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13651 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13652 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13653 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13654 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13655 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13656 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13657 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13658 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13659 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13660 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13661 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13662 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013663 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013664 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013665 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013666 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013667 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013668 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013669 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013670 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013671#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013672 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013673 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013674#endif
13675
Benjamin Peterson14339b62009-01-31 16:36:08 +000013676 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013677 {NULL, NULL}
13678};
13679
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013680static PyObject *
13681unicode_mod(PyObject *v, PyObject *w)
13682{
Brian Curtindfc80e32011-08-10 20:28:54 -050013683 if (!PyUnicode_Check(v))
13684 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013685 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013686}
13687
13688static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013689 0, /*nb_add*/
13690 0, /*nb_subtract*/
13691 0, /*nb_multiply*/
13692 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013693};
13694
Guido van Rossumd57fd912000-03-10 22:53:23 +000013695static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013696 (lenfunc) unicode_length, /* sq_length */
13697 PyUnicode_Concat, /* sq_concat */
13698 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13699 (ssizeargfunc) unicode_getitem, /* sq_item */
13700 0, /* sq_slice */
13701 0, /* sq_ass_item */
13702 0, /* sq_ass_slice */
13703 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013704};
13705
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013706static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013707unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013708{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013709 if (PyUnicode_READY(self) == -1)
13710 return NULL;
13711
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013712 if (PyIndex_Check(item)) {
13713 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013714 if (i == -1 && PyErr_Occurred())
13715 return NULL;
13716 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013717 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013718 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013719 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013720 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013721 PyObject *result;
13722 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013723 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013724 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013726 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013727 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013728 return NULL;
13729 }
13730
13731 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013732 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013733 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013734 slicelength == PyUnicode_GET_LENGTH(self)) {
13735 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013736 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013737 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013738 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013739 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013740 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013741 src_kind = PyUnicode_KIND(self);
13742 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013743 if (!PyUnicode_IS_ASCII(self)) {
13744 kind_limit = kind_maxchar_limit(src_kind);
13745 max_char = 0;
13746 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13747 ch = PyUnicode_READ(src_kind, src_data, cur);
13748 if (ch > max_char) {
13749 max_char = ch;
13750 if (max_char >= kind_limit)
13751 break;
13752 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013753 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013754 }
Victor Stinner55c99112011-10-13 01:17:06 +020013755 else
13756 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013757 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013758 if (result == NULL)
13759 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013760 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013761 dest_data = PyUnicode_DATA(result);
13762
13763 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013764 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13765 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013766 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013767 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013768 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013769 } else {
13770 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13771 return NULL;
13772 }
13773}
13774
13775static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013776 (lenfunc)unicode_length, /* mp_length */
13777 (binaryfunc)unicode_subscript, /* mp_subscript */
13778 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013779};
13780
Guido van Rossumd57fd912000-03-10 22:53:23 +000013781
Guido van Rossumd57fd912000-03-10 22:53:23 +000013782/* Helpers for PyUnicode_Format() */
13783
Victor Stinnera47082312012-10-04 02:19:54 +020013784struct unicode_formatter_t {
13785 PyObject *args;
13786 int args_owned;
13787 Py_ssize_t arglen, argidx;
13788 PyObject *dict;
13789
13790 enum PyUnicode_Kind fmtkind;
13791 Py_ssize_t fmtcnt, fmtpos;
13792 void *fmtdata;
13793 PyObject *fmtstr;
13794
13795 _PyUnicodeWriter writer;
13796};
13797
13798struct unicode_format_arg_t {
13799 Py_UCS4 ch;
13800 int flags;
13801 Py_ssize_t width;
13802 int prec;
13803 int sign;
13804};
13805
Guido van Rossumd57fd912000-03-10 22:53:23 +000013806static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013807unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013808{
Victor Stinnera47082312012-10-04 02:19:54 +020013809 Py_ssize_t argidx = ctx->argidx;
13810
13811 if (argidx < ctx->arglen) {
13812 ctx->argidx++;
13813 if (ctx->arglen < 0)
13814 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013815 else
Victor Stinnera47082312012-10-04 02:19:54 +020013816 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013817 }
13818 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013819 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013820 return NULL;
13821}
13822
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013823/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013824
Victor Stinnera47082312012-10-04 02:19:54 +020013825/* Format a float into the writer if the writer is not NULL, or into *p_output
13826 otherwise.
13827
13828 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013829static int
Victor Stinnera47082312012-10-04 02:19:54 +020013830formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13831 PyObject **p_output,
13832 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013833{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013834 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013835 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013836 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013837 int prec;
13838 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013839
Guido van Rossumd57fd912000-03-10 22:53:23 +000013840 x = PyFloat_AsDouble(v);
13841 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013842 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013843
Victor Stinnera47082312012-10-04 02:19:54 +020013844 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013845 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013846 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013847
Victor Stinnera47082312012-10-04 02:19:54 +020013848 if (arg->flags & F_ALT)
13849 dtoa_flags = Py_DTSF_ALT;
13850 else
13851 dtoa_flags = 0;
13852 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013853 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013854 return -1;
13855 len = strlen(p);
13856 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013857 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013858 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013859 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013860 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013861 }
13862 else
13863 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013864 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013865 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013866}
13867
Victor Stinnerd0880d52012-04-27 23:40:13 +020013868/* formatlong() emulates the format codes d, u, o, x and X, and
13869 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13870 * Python's regular ints.
13871 * Return value: a new PyUnicodeObject*, or NULL if error.
13872 * The output string is of the form
13873 * "-"? ("0x" | "0X")? digit+
13874 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13875 * set in flags. The case of hex digits will be correct,
13876 * There will be at least prec digits, zero-filled on the left if
13877 * necessary to get that many.
13878 * val object to be converted
13879 * flags bitmask of format flags; only F_ALT is looked at
13880 * prec minimum number of digits; 0-fill on left if needed
13881 * type a character in [duoxX]; u acts the same as d
13882 *
13883 * CAUTION: o, x and X conversions on regular ints can never
13884 * produce a '-' sign, but can for Python's unbounded ints.
13885 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013886PyObject *
13887_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000013888{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013889 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013890 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013891 Py_ssize_t i;
13892 int sign; /* 1 if '-', else 0 */
13893 int len; /* number of characters */
13894 Py_ssize_t llen;
13895 int numdigits; /* len == numnondigits + numdigits */
13896 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013897
Victor Stinnerd0880d52012-04-27 23:40:13 +020013898 /* Avoid exceeding SSIZE_T_MAX */
13899 if (prec > INT_MAX-3) {
13900 PyErr_SetString(PyExc_OverflowError,
13901 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013902 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013903 }
13904
13905 assert(PyLong_Check(val));
13906
13907 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013908 default:
13909 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013910 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013911 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013912 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013913 /* int and int subclasses should print numerically when a numeric */
13914 /* format code is used (see issue18780) */
13915 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013916 break;
13917 case 'o':
13918 numnondigits = 2;
13919 result = PyNumber_ToBase(val, 8);
13920 break;
13921 case 'x':
13922 case 'X':
13923 numnondigits = 2;
13924 result = PyNumber_ToBase(val, 16);
13925 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013926 }
13927 if (!result)
13928 return NULL;
13929
13930 assert(unicode_modifiable(result));
13931 assert(PyUnicode_IS_READY(result));
13932 assert(PyUnicode_IS_ASCII(result));
13933
13934 /* To modify the string in-place, there can only be one reference. */
13935 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013936 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013937 PyErr_BadInternalCall();
13938 return NULL;
13939 }
13940 buf = PyUnicode_DATA(result);
13941 llen = PyUnicode_GET_LENGTH(result);
13942 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013943 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013944 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080013945 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013946 return NULL;
13947 }
13948 len = (int)llen;
13949 sign = buf[0] == '-';
13950 numnondigits += sign;
13951 numdigits = len - numnondigits;
13952 assert(numdigits > 0);
13953
13954 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013955 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013956 (type == 'o' || type == 'x' || type == 'X'))) {
13957 assert(buf[sign] == '0');
13958 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13959 buf[sign+1] == 'o');
13960 numnondigits -= 2;
13961 buf += 2;
13962 len -= 2;
13963 if (sign)
13964 buf[0] = '-';
13965 assert(len == numnondigits + numdigits);
13966 assert(numdigits > 0);
13967 }
13968
13969 /* Fill with leading zeroes to meet minimum width. */
13970 if (prec > numdigits) {
13971 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13972 numnondigits + prec);
13973 char *b1;
13974 if (!r1) {
13975 Py_DECREF(result);
13976 return NULL;
13977 }
13978 b1 = PyBytes_AS_STRING(r1);
13979 for (i = 0; i < numnondigits; ++i)
13980 *b1++ = *buf++;
13981 for (i = 0; i < prec - numdigits; i++)
13982 *b1++ = '0';
13983 for (i = 0; i < numdigits; i++)
13984 *b1++ = *buf++;
13985 *b1 = '\0';
13986 Py_DECREF(result);
13987 result = r1;
13988 buf = PyBytes_AS_STRING(result);
13989 len = numnondigits + prec;
13990 }
13991
13992 /* Fix up case for hex conversions. */
13993 if (type == 'X') {
13994 /* Need to convert all lower case letters to upper case.
13995 and need to convert 0x to 0X (and -0x to -0X). */
13996 for (i = 0; i < len; i++)
13997 if (buf[i] >= 'a' && buf[i] <= 'x')
13998 buf[i] -= 'a'-'A';
13999 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014000 if (!PyUnicode_Check(result)
14001 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014002 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014003 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014004 Py_DECREF(result);
14005 result = unicode;
14006 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014007 else if (len != PyUnicode_GET_LENGTH(result)) {
14008 if (PyUnicode_Resize(&result, len) < 0)
14009 Py_CLEAR(result);
14010 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014011 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014012}
14013
Ethan Furmandf3ed242014-01-05 06:50:30 -080014014/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014015 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014016 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014017 * -1 and raise an exception on error */
14018static int
Victor Stinnera47082312012-10-04 02:19:54 +020014019mainformatlong(PyObject *v,
14020 struct unicode_format_arg_t *arg,
14021 PyObject **p_output,
14022 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014023{
14024 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014025 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014026
14027 if (!PyNumber_Check(v))
14028 goto wrongtype;
14029
Ethan Furman9ab74802014-03-21 06:38:46 -070014030 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014031 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014032 if (type == 'o' || type == 'x' || type == 'X') {
14033 iobj = PyNumber_Index(v);
14034 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014035 if (PyErr_ExceptionMatches(PyExc_TypeError))
14036 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014037 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014038 }
14039 }
14040 else {
14041 iobj = PyNumber_Long(v);
14042 if (iobj == NULL ) {
14043 if (PyErr_ExceptionMatches(PyExc_TypeError))
14044 goto wrongtype;
14045 return -1;
14046 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014047 }
14048 assert(PyLong_Check(iobj));
14049 }
14050 else {
14051 iobj = v;
14052 Py_INCREF(iobj);
14053 }
14054
14055 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014056 && arg->width == -1 && arg->prec == -1
14057 && !(arg->flags & (F_SIGN | F_BLANK))
14058 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014059 {
14060 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014061 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014062 int base;
14063
Victor Stinnera47082312012-10-04 02:19:54 +020014064 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014065 {
14066 default:
14067 assert(0 && "'type' not in [diuoxX]");
14068 case 'd':
14069 case 'i':
14070 case 'u':
14071 base = 10;
14072 break;
14073 case 'o':
14074 base = 8;
14075 break;
14076 case 'x':
14077 case 'X':
14078 base = 16;
14079 break;
14080 }
14081
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014082 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14083 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014084 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014085 }
14086 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014087 return 1;
14088 }
14089
Ethan Furmanb95b5612015-01-23 20:05:18 -080014090 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014091 Py_DECREF(iobj);
14092 if (res == NULL)
14093 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014094 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014095 return 0;
14096
14097wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014098 switch(type)
14099 {
14100 case 'o':
14101 case 'x':
14102 case 'X':
14103 PyErr_Format(PyExc_TypeError,
14104 "%%%c format: an integer is required, "
14105 "not %.200s",
14106 type, Py_TYPE(v)->tp_name);
14107 break;
14108 default:
14109 PyErr_Format(PyExc_TypeError,
14110 "%%%c format: a number is required, "
14111 "not %.200s",
14112 type, Py_TYPE(v)->tp_name);
14113 break;
14114 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014115 return -1;
14116}
14117
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014118static Py_UCS4
14119formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014120{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014121 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014122 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014123 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014124 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014125 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014126 goto onError;
14127 }
14128 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014129 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014130 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014131 /* make sure number is a type of integer */
14132 if (!PyLong_Check(v)) {
14133 iobj = PyNumber_Index(v);
14134 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014135 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014136 }
14137 v = iobj;
14138 Py_DECREF(iobj);
14139 }
14140 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014141 x = PyLong_AsLong(v);
14142 if (x == -1 && PyErr_Occurred())
14143 goto onError;
14144
Victor Stinner8faf8212011-12-08 22:14:11 +010014145 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014146 PyErr_SetString(PyExc_OverflowError,
14147 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014148 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014149 }
14150
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014151 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014152 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014153
Benjamin Peterson29060642009-01-31 22:14:21 +000014154 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014155 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014156 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014157 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014158}
14159
Victor Stinnera47082312012-10-04 02:19:54 +020014160/* Parse options of an argument: flags, width, precision.
14161 Handle also "%(name)" syntax.
14162
14163 Return 0 if the argument has been formatted into arg->str.
14164 Return 1 if the argument has been written into ctx->writer,
14165 Raise an exception and return -1 on error. */
14166static int
14167unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14168 struct unicode_format_arg_t *arg)
14169{
14170#define FORMAT_READ(ctx) \
14171 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14172
14173 PyObject *v;
14174
Victor Stinnera47082312012-10-04 02:19:54 +020014175 if (arg->ch == '(') {
14176 /* Get argument value from a dictionary. Example: "%(name)s". */
14177 Py_ssize_t keystart;
14178 Py_ssize_t keylen;
14179 PyObject *key;
14180 int pcount = 1;
14181
14182 if (ctx->dict == NULL) {
14183 PyErr_SetString(PyExc_TypeError,
14184 "format requires a mapping");
14185 return -1;
14186 }
14187 ++ctx->fmtpos;
14188 --ctx->fmtcnt;
14189 keystart = ctx->fmtpos;
14190 /* Skip over balanced parentheses */
14191 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14192 arg->ch = FORMAT_READ(ctx);
14193 if (arg->ch == ')')
14194 --pcount;
14195 else if (arg->ch == '(')
14196 ++pcount;
14197 ctx->fmtpos++;
14198 }
14199 keylen = ctx->fmtpos - keystart - 1;
14200 if (ctx->fmtcnt < 0 || pcount > 0) {
14201 PyErr_SetString(PyExc_ValueError,
14202 "incomplete format key");
14203 return -1;
14204 }
14205 key = PyUnicode_Substring(ctx->fmtstr,
14206 keystart, keystart + keylen);
14207 if (key == NULL)
14208 return -1;
14209 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014210 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014211 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014212 }
14213 ctx->args = PyObject_GetItem(ctx->dict, key);
14214 Py_DECREF(key);
14215 if (ctx->args == NULL)
14216 return -1;
14217 ctx->args_owned = 1;
14218 ctx->arglen = -1;
14219 ctx->argidx = -2;
14220 }
14221
14222 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014223 while (--ctx->fmtcnt >= 0) {
14224 arg->ch = FORMAT_READ(ctx);
14225 ctx->fmtpos++;
14226 switch (arg->ch) {
14227 case '-': arg->flags |= F_LJUST; continue;
14228 case '+': arg->flags |= F_SIGN; continue;
14229 case ' ': arg->flags |= F_BLANK; continue;
14230 case '#': arg->flags |= F_ALT; continue;
14231 case '0': arg->flags |= F_ZERO; continue;
14232 }
14233 break;
14234 }
14235
14236 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014237 if (arg->ch == '*') {
14238 v = unicode_format_getnextarg(ctx);
14239 if (v == NULL)
14240 return -1;
14241 if (!PyLong_Check(v)) {
14242 PyErr_SetString(PyExc_TypeError,
14243 "* wants int");
14244 return -1;
14245 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014246 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014247 if (arg->width == -1 && PyErr_Occurred())
14248 return -1;
14249 if (arg->width < 0) {
14250 arg->flags |= F_LJUST;
14251 arg->width = -arg->width;
14252 }
14253 if (--ctx->fmtcnt >= 0) {
14254 arg->ch = FORMAT_READ(ctx);
14255 ctx->fmtpos++;
14256 }
14257 }
14258 else if (arg->ch >= '0' && arg->ch <= '9') {
14259 arg->width = arg->ch - '0';
14260 while (--ctx->fmtcnt >= 0) {
14261 arg->ch = FORMAT_READ(ctx);
14262 ctx->fmtpos++;
14263 if (arg->ch < '0' || arg->ch > '9')
14264 break;
14265 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14266 mixing signed and unsigned comparison. Since arg->ch is between
14267 '0' and '9', casting to int is safe. */
14268 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14269 PyErr_SetString(PyExc_ValueError,
14270 "width too big");
14271 return -1;
14272 }
14273 arg->width = arg->width*10 + (arg->ch - '0');
14274 }
14275 }
14276
14277 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014278 if (arg->ch == '.') {
14279 arg->prec = 0;
14280 if (--ctx->fmtcnt >= 0) {
14281 arg->ch = FORMAT_READ(ctx);
14282 ctx->fmtpos++;
14283 }
14284 if (arg->ch == '*') {
14285 v = unicode_format_getnextarg(ctx);
14286 if (v == NULL)
14287 return -1;
14288 if (!PyLong_Check(v)) {
14289 PyErr_SetString(PyExc_TypeError,
14290 "* wants int");
14291 return -1;
14292 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014293 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014294 if (arg->prec == -1 && PyErr_Occurred())
14295 return -1;
14296 if (arg->prec < 0)
14297 arg->prec = 0;
14298 if (--ctx->fmtcnt >= 0) {
14299 arg->ch = FORMAT_READ(ctx);
14300 ctx->fmtpos++;
14301 }
14302 }
14303 else if (arg->ch >= '0' && arg->ch <= '9') {
14304 arg->prec = arg->ch - '0';
14305 while (--ctx->fmtcnt >= 0) {
14306 arg->ch = FORMAT_READ(ctx);
14307 ctx->fmtpos++;
14308 if (arg->ch < '0' || arg->ch > '9')
14309 break;
14310 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14311 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014312 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014313 return -1;
14314 }
14315 arg->prec = arg->prec*10 + (arg->ch - '0');
14316 }
14317 }
14318 }
14319
14320 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14321 if (ctx->fmtcnt >= 0) {
14322 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14323 if (--ctx->fmtcnt >= 0) {
14324 arg->ch = FORMAT_READ(ctx);
14325 ctx->fmtpos++;
14326 }
14327 }
14328 }
14329 if (ctx->fmtcnt < 0) {
14330 PyErr_SetString(PyExc_ValueError,
14331 "incomplete format");
14332 return -1;
14333 }
14334 return 0;
14335
14336#undef FORMAT_READ
14337}
14338
14339/* Format one argument. Supported conversion specifiers:
14340
14341 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014342 - "i", "d", "u": int or float
14343 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014344 - "e", "E", "f", "F", "g", "G": float
14345 - "c": int or str (1 character)
14346
Victor Stinner8dbd4212012-12-04 09:30:24 +010014347 When possible, the output is written directly into the Unicode writer
14348 (ctx->writer). A string is created when padding is required.
14349
Victor Stinnera47082312012-10-04 02:19:54 +020014350 Return 0 if the argument has been formatted into *p_str,
14351 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014352 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014353static int
14354unicode_format_arg_format(struct unicode_formatter_t *ctx,
14355 struct unicode_format_arg_t *arg,
14356 PyObject **p_str)
14357{
14358 PyObject *v;
14359 _PyUnicodeWriter *writer = &ctx->writer;
14360
14361 if (ctx->fmtcnt == 0)
14362 ctx->writer.overallocate = 0;
14363
14364 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014365 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014366 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014367 return 1;
14368 }
14369
14370 v = unicode_format_getnextarg(ctx);
14371 if (v == NULL)
14372 return -1;
14373
Victor Stinnera47082312012-10-04 02:19:54 +020014374
14375 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014376 case 's':
14377 case 'r':
14378 case 'a':
14379 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14380 /* Fast path */
14381 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14382 return -1;
14383 return 1;
14384 }
14385
14386 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14387 *p_str = v;
14388 Py_INCREF(*p_str);
14389 }
14390 else {
14391 if (arg->ch == 's')
14392 *p_str = PyObject_Str(v);
14393 else if (arg->ch == 'r')
14394 *p_str = PyObject_Repr(v);
14395 else
14396 *p_str = PyObject_ASCII(v);
14397 }
14398 break;
14399
14400 case 'i':
14401 case 'd':
14402 case 'u':
14403 case 'o':
14404 case 'x':
14405 case 'X':
14406 {
14407 int ret = mainformatlong(v, arg, p_str, writer);
14408 if (ret != 0)
14409 return ret;
14410 arg->sign = 1;
14411 break;
14412 }
14413
14414 case 'e':
14415 case 'E':
14416 case 'f':
14417 case 'F':
14418 case 'g':
14419 case 'G':
14420 if (arg->width == -1 && arg->prec == -1
14421 && !(arg->flags & (F_SIGN | F_BLANK)))
14422 {
14423 /* Fast path */
14424 if (formatfloat(v, arg, NULL, writer) == -1)
14425 return -1;
14426 return 1;
14427 }
14428
14429 arg->sign = 1;
14430 if (formatfloat(v, arg, p_str, NULL) == -1)
14431 return -1;
14432 break;
14433
14434 case 'c':
14435 {
14436 Py_UCS4 ch = formatchar(v);
14437 if (ch == (Py_UCS4) -1)
14438 return -1;
14439 if (arg->width == -1 && arg->prec == -1) {
14440 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014441 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014442 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014443 return 1;
14444 }
14445 *p_str = PyUnicode_FromOrdinal(ch);
14446 break;
14447 }
14448
14449 default:
14450 PyErr_Format(PyExc_ValueError,
14451 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014452 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014453 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14454 (int)arg->ch,
14455 ctx->fmtpos - 1);
14456 return -1;
14457 }
14458 if (*p_str == NULL)
14459 return -1;
14460 assert (PyUnicode_Check(*p_str));
14461 return 0;
14462}
14463
14464static int
14465unicode_format_arg_output(struct unicode_formatter_t *ctx,
14466 struct unicode_format_arg_t *arg,
14467 PyObject *str)
14468{
14469 Py_ssize_t len;
14470 enum PyUnicode_Kind kind;
14471 void *pbuf;
14472 Py_ssize_t pindex;
14473 Py_UCS4 signchar;
14474 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014475 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014476 Py_ssize_t sublen;
14477 _PyUnicodeWriter *writer = &ctx->writer;
14478 Py_UCS4 fill;
14479
14480 fill = ' ';
14481 if (arg->sign && arg->flags & F_ZERO)
14482 fill = '0';
14483
14484 if (PyUnicode_READY(str) == -1)
14485 return -1;
14486
14487 len = PyUnicode_GET_LENGTH(str);
14488 if ((arg->width == -1 || arg->width <= len)
14489 && (arg->prec == -1 || arg->prec >= len)
14490 && !(arg->flags & (F_SIGN | F_BLANK)))
14491 {
14492 /* Fast path */
14493 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14494 return -1;
14495 return 0;
14496 }
14497
14498 /* Truncate the string for "s", "r" and "a" formats
14499 if the precision is set */
14500 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14501 if (arg->prec >= 0 && len > arg->prec)
14502 len = arg->prec;
14503 }
14504
14505 /* Adjust sign and width */
14506 kind = PyUnicode_KIND(str);
14507 pbuf = PyUnicode_DATA(str);
14508 pindex = 0;
14509 signchar = '\0';
14510 if (arg->sign) {
14511 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14512 if (ch == '-' || ch == '+') {
14513 signchar = ch;
14514 len--;
14515 pindex++;
14516 }
14517 else if (arg->flags & F_SIGN)
14518 signchar = '+';
14519 else if (arg->flags & F_BLANK)
14520 signchar = ' ';
14521 else
14522 arg->sign = 0;
14523 }
14524 if (arg->width < len)
14525 arg->width = len;
14526
14527 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014528 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014529 if (!(arg->flags & F_LJUST)) {
14530 if (arg->sign) {
14531 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014532 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014533 }
14534 else {
14535 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014536 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014537 }
14538 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014539 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14540 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014541 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014542 }
14543
Victor Stinnera47082312012-10-04 02:19:54 +020014544 buflen = arg->width;
14545 if (arg->sign && len == arg->width)
14546 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014547 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014548 return -1;
14549
14550 /* Write the sign if needed */
14551 if (arg->sign) {
14552 if (fill != ' ') {
14553 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14554 writer->pos += 1;
14555 }
14556 if (arg->width > len)
14557 arg->width--;
14558 }
14559
14560 /* Write the numeric prefix for "x", "X" and "o" formats
14561 if the alternate form is used.
14562 For example, write "0x" for the "%#x" format. */
14563 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14564 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14565 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14566 if (fill != ' ') {
14567 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14568 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14569 writer->pos += 2;
14570 pindex += 2;
14571 }
14572 arg->width -= 2;
14573 if (arg->width < 0)
14574 arg->width = 0;
14575 len -= 2;
14576 }
14577
14578 /* Pad left with the fill character if needed */
14579 if (arg->width > len && !(arg->flags & F_LJUST)) {
14580 sublen = arg->width - len;
14581 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14582 writer->pos += sublen;
14583 arg->width = len;
14584 }
14585
14586 /* If padding with spaces: write sign if needed and/or numeric prefix if
14587 the alternate form is used */
14588 if (fill == ' ') {
14589 if (arg->sign) {
14590 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14591 writer->pos += 1;
14592 }
14593 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14594 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14595 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14596 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14597 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14598 writer->pos += 2;
14599 pindex += 2;
14600 }
14601 }
14602
14603 /* Write characters */
14604 if (len) {
14605 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14606 str, pindex, len);
14607 writer->pos += len;
14608 }
14609
14610 /* Pad right with the fill character if needed */
14611 if (arg->width > len) {
14612 sublen = arg->width - len;
14613 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14614 writer->pos += sublen;
14615 }
14616 return 0;
14617}
14618
14619/* Helper of PyUnicode_Format(): format one arg.
14620 Return 0 on success, raise an exception and return -1 on error. */
14621static int
14622unicode_format_arg(struct unicode_formatter_t *ctx)
14623{
14624 struct unicode_format_arg_t arg;
14625 PyObject *str;
14626 int ret;
14627
Victor Stinner8dbd4212012-12-04 09:30:24 +010014628 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14629 arg.flags = 0;
14630 arg.width = -1;
14631 arg.prec = -1;
14632 arg.sign = 0;
14633 str = NULL;
14634
Victor Stinnera47082312012-10-04 02:19:54 +020014635 ret = unicode_format_arg_parse(ctx, &arg);
14636 if (ret == -1)
14637 return -1;
14638
14639 ret = unicode_format_arg_format(ctx, &arg, &str);
14640 if (ret == -1)
14641 return -1;
14642
14643 if (ret != 1) {
14644 ret = unicode_format_arg_output(ctx, &arg, str);
14645 Py_DECREF(str);
14646 if (ret == -1)
14647 return -1;
14648 }
14649
14650 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14651 PyErr_SetString(PyExc_TypeError,
14652 "not all arguments converted during string formatting");
14653 return -1;
14654 }
14655 return 0;
14656}
14657
Alexander Belopolsky40018472011-02-26 01:02:56 +000014658PyObject *
14659PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014660{
Victor Stinnera47082312012-10-04 02:19:54 +020014661 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014662
Guido van Rossumd57fd912000-03-10 22:53:23 +000014663 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014664 PyErr_BadInternalCall();
14665 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014666 }
Victor Stinnera47082312012-10-04 02:19:54 +020014667
14668 ctx.fmtstr = PyUnicode_FromObject(format);
14669 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014670 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014671 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14672 Py_DECREF(ctx.fmtstr);
14673 return NULL;
14674 }
14675 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14676 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14677 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14678 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014679
Victor Stinner8f674cc2013-04-17 23:02:17 +020014680 _PyUnicodeWriter_Init(&ctx.writer);
14681 ctx.writer.min_length = ctx.fmtcnt + 100;
14682 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014683
Guido van Rossumd57fd912000-03-10 22:53:23 +000014684 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014685 ctx.arglen = PyTuple_Size(args);
14686 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014687 }
14688 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014689 ctx.arglen = -1;
14690 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014691 }
Victor Stinnera47082312012-10-04 02:19:54 +020014692 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014693 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014694 ctx.dict = args;
14695 else
14696 ctx.dict = NULL;
14697 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014698
Victor Stinnera47082312012-10-04 02:19:54 +020014699 while (--ctx.fmtcnt >= 0) {
14700 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014701 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014702
14703 nonfmtpos = ctx.fmtpos++;
14704 while (ctx.fmtcnt >= 0 &&
14705 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14706 ctx.fmtpos++;
14707 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014708 }
Victor Stinnera47082312012-10-04 02:19:54 +020014709 if (ctx.fmtcnt < 0) {
14710 ctx.fmtpos--;
14711 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014712 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014713
Victor Stinnercfc4c132013-04-03 01:48:39 +020014714 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14715 nonfmtpos, ctx.fmtpos) < 0)
14716 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014717 }
14718 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014719 ctx.fmtpos++;
14720 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014721 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014722 }
14723 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014724
Victor Stinnera47082312012-10-04 02:19:54 +020014725 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014726 PyErr_SetString(PyExc_TypeError,
14727 "not all arguments converted during string formatting");
14728 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014729 }
14730
Victor Stinnera47082312012-10-04 02:19:54 +020014731 if (ctx.args_owned) {
14732 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014733 }
Victor Stinnera47082312012-10-04 02:19:54 +020014734 Py_DECREF(ctx.fmtstr);
14735 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014736
Benjamin Peterson29060642009-01-31 22:14:21 +000014737 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014738 Py_DECREF(ctx.fmtstr);
14739 _PyUnicodeWriter_Dealloc(&ctx.writer);
14740 if (ctx.args_owned) {
14741 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014742 }
14743 return NULL;
14744}
14745
Jeremy Hylton938ace62002-07-17 16:30:39 +000014746static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014747unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14748
Tim Peters6d6c1a32001-08-02 04:15:00 +000014749static PyObject *
14750unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14751{
Benjamin Peterson29060642009-01-31 22:14:21 +000014752 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014753 static char *kwlist[] = {"object", "encoding", "errors", 0};
14754 char *encoding = NULL;
14755 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014756
Benjamin Peterson14339b62009-01-31 16:36:08 +000014757 if (type != &PyUnicode_Type)
14758 return unicode_subtype_new(type, args, kwds);
14759 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014760 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014761 return NULL;
14762 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014763 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014764 if (encoding == NULL && errors == NULL)
14765 return PyObject_Str(x);
14766 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014767 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014768}
14769
Guido van Rossume023fe02001-08-30 03:12:59 +000014770static PyObject *
14771unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14772{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014773 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014774 Py_ssize_t length, char_size;
14775 int share_wstr, share_utf8;
14776 unsigned int kind;
14777 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014778
Benjamin Peterson14339b62009-01-31 16:36:08 +000014779 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014780
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014781 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014782 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014783 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014784 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014785 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014786 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014787 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014788 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014789
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014790 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014791 if (self == NULL) {
14792 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014793 return NULL;
14794 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014795 kind = PyUnicode_KIND(unicode);
14796 length = PyUnicode_GET_LENGTH(unicode);
14797
14798 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014799#ifdef Py_DEBUG
14800 _PyUnicode_HASH(self) = -1;
14801#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014802 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014803#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014804 _PyUnicode_STATE(self).interned = 0;
14805 _PyUnicode_STATE(self).kind = kind;
14806 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014807 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014808 _PyUnicode_STATE(self).ready = 1;
14809 _PyUnicode_WSTR(self) = NULL;
14810 _PyUnicode_UTF8_LENGTH(self) = 0;
14811 _PyUnicode_UTF8(self) = NULL;
14812 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014813 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014814
14815 share_utf8 = 0;
14816 share_wstr = 0;
14817 if (kind == PyUnicode_1BYTE_KIND) {
14818 char_size = 1;
14819 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14820 share_utf8 = 1;
14821 }
14822 else if (kind == PyUnicode_2BYTE_KIND) {
14823 char_size = 2;
14824 if (sizeof(wchar_t) == 2)
14825 share_wstr = 1;
14826 }
14827 else {
14828 assert(kind == PyUnicode_4BYTE_KIND);
14829 char_size = 4;
14830 if (sizeof(wchar_t) == 4)
14831 share_wstr = 1;
14832 }
14833
14834 /* Ensure we won't overflow the length. */
14835 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14836 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014837 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014838 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014839 data = PyObject_MALLOC((length + 1) * char_size);
14840 if (data == NULL) {
14841 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014842 goto onError;
14843 }
14844
Victor Stinnerc3c74152011-10-02 20:39:55 +020014845 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014846 if (share_utf8) {
14847 _PyUnicode_UTF8_LENGTH(self) = length;
14848 _PyUnicode_UTF8(self) = data;
14849 }
14850 if (share_wstr) {
14851 _PyUnicode_WSTR_LENGTH(self) = length;
14852 _PyUnicode_WSTR(self) = (wchar_t *)data;
14853 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014854
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014855 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014856 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014857 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014858#ifdef Py_DEBUG
14859 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14860#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014861 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014862 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014863
14864onError:
14865 Py_DECREF(unicode);
14866 Py_DECREF(self);
14867 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014868}
14869
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014870PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014871"str(object='') -> str\n\
14872str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014873\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014874Create a new string object from the given object. If encoding or\n\
14875errors is specified, then the object must expose a data buffer\n\
14876that will be decoded using the given encoding and error handler.\n\
14877Otherwise, returns the result of object.__str__() (if defined)\n\
14878or repr(object).\n\
14879encoding defaults to sys.getdefaultencoding().\n\
14880errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014881
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014882static PyObject *unicode_iter(PyObject *seq);
14883
Guido van Rossumd57fd912000-03-10 22:53:23 +000014884PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014885 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014886 "str", /* tp_name */
14887 sizeof(PyUnicodeObject), /* tp_size */
14888 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014889 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014890 (destructor)unicode_dealloc, /* tp_dealloc */
14891 0, /* tp_print */
14892 0, /* tp_getattr */
14893 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014894 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014895 unicode_repr, /* tp_repr */
14896 &unicode_as_number, /* tp_as_number */
14897 &unicode_as_sequence, /* tp_as_sequence */
14898 &unicode_as_mapping, /* tp_as_mapping */
14899 (hashfunc) unicode_hash, /* tp_hash*/
14900 0, /* tp_call*/
14901 (reprfunc) unicode_str, /* tp_str */
14902 PyObject_GenericGetAttr, /* tp_getattro */
14903 0, /* tp_setattro */
14904 0, /* tp_as_buffer */
14905 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014906 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014907 unicode_doc, /* tp_doc */
14908 0, /* tp_traverse */
14909 0, /* tp_clear */
14910 PyUnicode_RichCompare, /* tp_richcompare */
14911 0, /* tp_weaklistoffset */
14912 unicode_iter, /* tp_iter */
14913 0, /* tp_iternext */
14914 unicode_methods, /* tp_methods */
14915 0, /* tp_members */
14916 0, /* tp_getset */
14917 &PyBaseObject_Type, /* tp_base */
14918 0, /* tp_dict */
14919 0, /* tp_descr_get */
14920 0, /* tp_descr_set */
14921 0, /* tp_dictoffset */
14922 0, /* tp_init */
14923 0, /* tp_alloc */
14924 unicode_new, /* tp_new */
14925 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014926};
14927
14928/* Initialize the Unicode implementation */
14929
Victor Stinner3a50e702011-10-18 21:21:00 +020014930int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014931{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014932 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014933 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014934 0x000A, /* LINE FEED */
14935 0x000D, /* CARRIAGE RETURN */
14936 0x001C, /* FILE SEPARATOR */
14937 0x001D, /* GROUP SEPARATOR */
14938 0x001E, /* RECORD SEPARATOR */
14939 0x0085, /* NEXT LINE */
14940 0x2028, /* LINE SEPARATOR */
14941 0x2029, /* PARAGRAPH SEPARATOR */
14942 };
14943
Fred Drakee4315f52000-05-09 19:53:39 +000014944 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014945 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014946 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014947 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014948 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014949
Guido van Rossumcacfc072002-05-24 19:01:59 +000014950 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014951 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014952
14953 /* initialize the linebreak bloom filter */
14954 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014955 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014956 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014957
Christian Heimes26532f72013-07-20 14:57:16 +020014958 if (PyType_Ready(&EncodingMapType) < 0)
14959 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014960
Benjamin Petersonc4311282012-10-30 23:21:10 -040014961 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14962 Py_FatalError("Can't initialize field name iterator type");
14963
14964 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14965 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014966
Victor Stinner3a50e702011-10-18 21:21:00 +020014967 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014968}
14969
14970/* Finalize the Unicode implementation */
14971
/* Free-list clearing hook: there is nothing to release here, so the
   function does no work and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    int freed = 0;

    return freed;
}
14977
Guido van Rossumd57fd912000-03-10 22:53:23 +000014978void
Thomas Wouters78890102000-07-22 19:25:51 +000014979_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014980{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014981 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014982
Serhiy Storchaka05997252013-01-26 12:14:02 +020014983 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014984
Serhiy Storchaka05997252013-01-26 12:14:02 +020014985 for (i = 0; i < 256; i++)
14986 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014987 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014988 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014989}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014990
Walter Dörwald16807132007-05-25 13:52:07 +000014991void
14992PyUnicode_InternInPlace(PyObject **p)
14993{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020014994 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014995 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014996#ifdef Py_DEBUG
14997 assert(s != NULL);
14998 assert(_PyUnicode_CHECK(s));
14999#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015000 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015001 return;
15002#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015003 /* If it's a subclass, we don't really know what putting
15004 it in the interned dict might do. */
15005 if (!PyUnicode_CheckExact(s))
15006 return;
15007 if (PyUnicode_CHECK_INTERNED(s))
15008 return;
15009 if (interned == NULL) {
15010 interned = PyDict_New();
15011 if (interned == NULL) {
15012 PyErr_Clear(); /* Don't leave an exception */
15013 return;
15014 }
15015 }
15016 /* It might be that the GetItem call fails even
15017 though the key is present in the dictionary,
15018 namely when this happens during a stack overflow. */
15019 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010015020 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015021 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000015022
Victor Stinnerf0335102013-04-14 19:13:03 +020015023 if (t) {
15024 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015025 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015026 return;
15027 }
Walter Dörwald16807132007-05-25 13:52:07 +000015028
Benjamin Peterson14339b62009-01-31 16:36:08 +000015029 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015030 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015031 PyErr_Clear();
15032 PyThreadState_GET()->recursion_critical = 0;
15033 return;
15034 }
15035 PyThreadState_GET()->recursion_critical = 0;
15036 /* The two references in interned are not counted by refcnt.
15037 The deallocator will take care of this */
15038 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015039 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015040}
15041
15042void
15043PyUnicode_InternImmortal(PyObject **p)
15044{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015045 PyUnicode_InternInPlace(p);
15046 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015047 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015048 Py_INCREF(*p);
15049 }
Walter Dörwald16807132007-05-25 13:52:07 +000015050}
15051
15052PyObject *
15053PyUnicode_InternFromString(const char *cp)
15054{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015055 PyObject *s = PyUnicode_FromString(cp);
15056 if (s == NULL)
15057 return NULL;
15058 PyUnicode_InternInPlace(&s);
15059 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015060}
15061
Alexander Belopolsky40018472011-02-26 01:02:56 +000015062void
15063_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015064{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015065 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015066 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015067 Py_ssize_t i, n;
15068 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015069
Benjamin Peterson14339b62009-01-31 16:36:08 +000015070 if (interned == NULL || !PyDict_Check(interned))
15071 return;
15072 keys = PyDict_Keys(interned);
15073 if (keys == NULL || !PyList_Check(keys)) {
15074 PyErr_Clear();
15075 return;
15076 }
Walter Dörwald16807132007-05-25 13:52:07 +000015077
Benjamin Peterson14339b62009-01-31 16:36:08 +000015078 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15079 detector, interned unicode strings are not forcibly deallocated;
15080 rather, we give them their stolen references back, and then clear
15081 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015082
Benjamin Peterson14339b62009-01-31 16:36:08 +000015083 n = PyList_GET_SIZE(keys);
15084 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015085 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015086 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015087 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015088 if (PyUnicode_READY(s) == -1) {
15089 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015090 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015091 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015092 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015093 case SSTATE_NOT_INTERNED:
15094 /* XXX Shouldn't happen */
15095 break;
15096 case SSTATE_INTERNED_IMMORTAL:
15097 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015098 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015099 break;
15100 case SSTATE_INTERNED_MORTAL:
15101 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015102 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015103 break;
15104 default:
15105 Py_FatalError("Inconsistent interned string state.");
15106 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015107 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015108 }
15109 fprintf(stderr, "total size of all interned strings: "
15110 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15111 "mortal/immortal\n", mortal_size, immortal_size);
15112 Py_DECREF(keys);
15113 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015114 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015115}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015116
15117
15118/********************* Unicode Iterator **************************/
15119
15120typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015121 PyObject_HEAD
15122 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015123 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015124} unicodeiterobject;
15125
15126static void
15127unicodeiter_dealloc(unicodeiterobject *it)
15128{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015129 _PyObject_GC_UNTRACK(it);
15130 Py_XDECREF(it->it_seq);
15131 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015132}
15133
15134static int
15135unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15136{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015137 Py_VISIT(it->it_seq);
15138 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015139}
15140
15141static PyObject *
15142unicodeiter_next(unicodeiterobject *it)
15143{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015144 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015145
Benjamin Peterson14339b62009-01-31 16:36:08 +000015146 assert(it != NULL);
15147 seq = it->it_seq;
15148 if (seq == NULL)
15149 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015150 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015152 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15153 int kind = PyUnicode_KIND(seq);
15154 void *data = PyUnicode_DATA(seq);
15155 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15156 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015157 if (item != NULL)
15158 ++it->it_index;
15159 return item;
15160 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015161
Benjamin Peterson14339b62009-01-31 16:36:08 +000015162 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015163 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015164 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015165}
15166
15167static PyObject *
15168unicodeiter_len(unicodeiterobject *it)
15169{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015170 Py_ssize_t len = 0;
15171 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015172 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015173 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015174}
15175
15176PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15177
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015178static PyObject *
15179unicodeiter_reduce(unicodeiterobject *it)
15180{
15181 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015182 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015183 it->it_seq, it->it_index);
15184 } else {
15185 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15186 if (u == NULL)
15187 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015188 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015189 }
15190}
15191
15192PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15193
15194static PyObject *
15195unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15196{
15197 Py_ssize_t index = PyLong_AsSsize_t(state);
15198 if (index == -1 && PyErr_Occurred())
15199 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015200 if (it->it_seq != NULL) {
15201 if (index < 0)
15202 index = 0;
15203 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15204 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15205 it->it_index = index;
15206 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015207 Py_RETURN_NONE;
15208}
15209
15210PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15211
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015212static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015213 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015214 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015215 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15216 reduce_doc},
15217 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15218 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015219 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015220};
15221
15222PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015223 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15224 "str_iterator", /* tp_name */
15225 sizeof(unicodeiterobject), /* tp_basicsize */
15226 0, /* tp_itemsize */
15227 /* methods */
15228 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15229 0, /* tp_print */
15230 0, /* tp_getattr */
15231 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015232 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015233 0, /* tp_repr */
15234 0, /* tp_as_number */
15235 0, /* tp_as_sequence */
15236 0, /* tp_as_mapping */
15237 0, /* tp_hash */
15238 0, /* tp_call */
15239 0, /* tp_str */
15240 PyObject_GenericGetAttr, /* tp_getattro */
15241 0, /* tp_setattro */
15242 0, /* tp_as_buffer */
15243 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15244 0, /* tp_doc */
15245 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15246 0, /* tp_clear */
15247 0, /* tp_richcompare */
15248 0, /* tp_weaklistoffset */
15249 PyObject_SelfIter, /* tp_iter */
15250 (iternextfunc)unicodeiter_next, /* tp_iternext */
15251 unicodeiter_methods, /* tp_methods */
15252 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015253};
15254
15255static PyObject *
15256unicode_iter(PyObject *seq)
15257{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015258 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015259
Benjamin Peterson14339b62009-01-31 16:36:08 +000015260 if (!PyUnicode_Check(seq)) {
15261 PyErr_BadInternalCall();
15262 return NULL;
15263 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015264 if (PyUnicode_READY(seq) == -1)
15265 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015266 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15267 if (it == NULL)
15268 return NULL;
15269 it->it_index = 0;
15270 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015271 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015272 _PyObject_GC_TRACK(it);
15273 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015274}
15275
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015276
15277size_t
15278Py_UNICODE_strlen(const Py_UNICODE *u)
15279{
15280 int res = 0;
15281 while(*u++)
15282 res++;
15283 return res;
15284}
15285
15286Py_UNICODE*
15287Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15288{
15289 Py_UNICODE *u = s1;
15290 while ((*u++ = *s2++));
15291 return s1;
15292}
15293
15294Py_UNICODE*
15295Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15296{
15297 Py_UNICODE *u = s1;
15298 while ((*u++ = *s2++))
15299 if (n-- == 0)
15300 break;
15301 return s1;
15302}
15303
15304Py_UNICODE*
15305Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15306{
15307 Py_UNICODE *u1 = s1;
15308 u1 += Py_UNICODE_strlen(u1);
15309 Py_UNICODE_strcpy(u1, s2);
15310 return s1;
15311}
15312
15313int
15314Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15315{
15316 while (*s1 && *s2 && *s1 == *s2)
15317 s1++, s2++;
15318 if (*s1 && *s2)
15319 return (*s1 < *s2) ? -1 : +1;
15320 if (*s1)
15321 return 1;
15322 if (*s2)
15323 return -1;
15324 return 0;
15325}
15326
15327int
15328Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15329{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015330 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015331 for (; n != 0; n--) {
15332 u1 = *s1;
15333 u2 = *s2;
15334 if (u1 != u2)
15335 return (u1 < u2) ? -1 : +1;
15336 if (u1 == '\0')
15337 return 0;
15338 s1++;
15339 s2++;
15340 }
15341 return 0;
15342}
15343
15344Py_UNICODE*
15345Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15346{
15347 const Py_UNICODE *p;
15348 for (p = s; *p; p++)
15349 if (*p == c)
15350 return (Py_UNICODE*)p;
15351 return NULL;
15352}
15353
15354Py_UNICODE*
15355Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15356{
15357 const Py_UNICODE *p;
15358 p = s + Py_UNICODE_strlen(s);
15359 while (p != s) {
15360 p--;
15361 if (*p == c)
15362 return (Py_UNICODE*)p;
15363 }
15364 return NULL;
15365}
Victor Stinner331ea922010-08-10 16:37:20 +000015366
Victor Stinner71133ff2010-09-01 23:43:53 +000015367Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015368PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015369{
Victor Stinner577db2c2011-10-11 22:12:48 +020015370 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015371 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015373 if (!PyUnicode_Check(unicode)) {
15374 PyErr_BadArgument();
15375 return NULL;
15376 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015377 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015378 if (u == NULL)
15379 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015380 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015381 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015382 PyErr_NoMemory();
15383 return NULL;
15384 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015385 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015386 size *= sizeof(Py_UNICODE);
15387 copy = PyMem_Malloc(size);
15388 if (copy == NULL) {
15389 PyErr_NoMemory();
15390 return NULL;
15391 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015392 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015393 return copy;
15394}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015395
Georg Brandl66c221e2010-10-14 07:04:07 +000015396/* A _string module, to export formatter_parser and formatter_field_name_split
15397 to the string.Formatter class implemented in Python. */
15398
15399static PyMethodDef _string_methods[] = {
15400 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15401 METH_O, PyDoc_STR("split the argument as a field name")},
15402 {"formatter_parser", (PyCFunction) formatter_parser,
15403 METH_O, PyDoc_STR("parse the argument as a format string")},
15404 {NULL, NULL}
15405};
15406
15407static struct PyModuleDef _string_module = {
15408 PyModuleDef_HEAD_INIT,
15409 "_string",
15410 PyDoc_STR("string helper module"),
15411 0,
15412 _string_methods,
15413 NULL,
15414 NULL,
15415 NULL,
15416 NULL
15417};
15418
15419PyMODINIT_FUNC
15420PyInit__string(void)
15421{
15422 return PyModule_Create(&_string_module);
15423}
15424
15425
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015426#ifdef __cplusplus
15427}
15428#endif